• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20361865516

19 Dec 2025 06:25AM UTC coverage: 89.13% (-0.002%) from 89.132%
20361865516

push

github

bramp
Fix lint errors: line length, unused imports, and YAML issues

- Add ruff isort configuration with known-first-party for build_a_long
- Add per-file E501 ignore for legocom_test.py (JSON test data)
- Create .yamllint config to relax strict YAML rules
- Fix E501 line length errors by wrapping long comments and strings
- Fix F841 unused variable errors
- Fix PLC0415 import-at-non-top-level errors
- Fix SIM108 ternary simplification errors

12 of 14 new or added lines in 8 files covered. (85.71%)

78 existing lines in 6 files now uncovered.

12915 of 14490 relevant lines covered (89.13%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

25.55
/src/build_a_long/pdf_extract/cli/unassigned_diagnostics.py
1
"""Diagnostic utilities for analyzing unassigned blocks.
2

3
This module provides tools to categorize and explain why blocks weren't assigned
4
to any LEGO page element. It helps identify patterns in unassigned blocks and
5
provides actionable recommendations.
6
"""
7

8
from __future__ import annotations
1✔
9

10
from collections import defaultdict
1✔
11
from dataclasses import dataclass
1✔
12
from enum import Enum, auto
1✔
13
from typing import TYPE_CHECKING
1✔
14

15
from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing, Image, Text
1✔
16

17
if TYPE_CHECKING:
18
    from build_a_long.pdf_extract.classifier.classification_result import (
19
        ClassificationResult,
20
    )
21
    from build_a_long.pdf_extract.extractor.extractor import PageData
22

23

24
class UnassignedCategory(Enum):
    """Reasons a block may be left without an assigned page element.

    Values are generated with ``auto()``; the members are meant to be compared
    by identity and displayed via ``name``.
    """

    ZERO_WIDTH = auto()
    """Drawing whose bounding box has zero width, away from the page edge"""

    ZERO_HEIGHT = auto()
    """Drawing whose bounding box has zero height, away from the page edge"""

    PAGE_EDGE_LINE = auto()
    """Zero-width line at x=0 / x=page_width, or zero-height line at
    y=0 / y=page_height"""

    WHITESPACE_TEXT = auto()
    """Text that is empty or contains only whitespace"""

    COPYRIGHT_TEXT = auto()
    """Text containing a copyright/trademark symbol or legal keyword"""

    SMALL_DOT = auto()
    """Very small drawing (likely a dot or artifact)"""

    LARGE_UNCLASSIFIED = auto()
    """Large drawing that didn't match any classifier"""

    IMAGE_IN_COMPLEX_PAGE = auto()
    """Image that wasn't assigned to any element"""

    UNKNOWN = auto()
    """Fallback for blocks that match none of the other categories"""
1✔
53

54

55
@dataclass
class UnassignedBlockInfo:
    """Diagnosis of a single unassigned block.

    Attributes:
        block: The block that was left unassigned.
        category: Category the block was sorted into.
        reason: Human-readable explanation of why the category applies.
        recommendation: Suggested follow-up action for blocks of this kind.
    """

    block: Blocks
    category: UnassignedCategory
    reason: str
    recommendation: str
1✔
63

64

65
def categorize_unassigned_block(
    block: Blocks,
    page_data: PageData,
) -> UnassignedBlockInfo:
    """Categorize an unassigned block and provide actionable information.

    Drawings are checked for zero width, zero height, tiny area and large
    area (in that order). Text is checked for whitespace-only content and
    then for copyright/trademark keywords. Every image is reported as an
    unassigned image. A block matching none of these rules falls back to
    ``UnassignedCategory.UNKNOWN``.

    Args:
        block: The unassigned block
        page_data: Page data for context (page dimensions, etc.)

    Returns:
        UnassignedBlockInfo with category, reason, and recommendation
    """
    page_width = page_data.bbox.width
    page_height = page_data.bbox.height

    if isinstance(block, Drawing):
        info = _categorize_drawing(block, page_width, page_height)
        if info is not None:
            return info

    if isinstance(block, Text):
        info = _categorize_text(block)
        if info is not None:
            return info

    if isinstance(block, Image):
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.IMAGE_IN_COMPLEX_PAGE,
            reason="Image not assigned to any element",
            recommendation=(
                "Review if this should be a diagram, part_image, or other element"
            ),
        )

    # Default: unknown category
    return UnassignedBlockInfo(
        block=block,
        category=UnassignedCategory.UNKNOWN,
        reason="Does not match known unassigned patterns",
        recommendation="Manual review required",
    )


# Drawings with an area below this (sq pts, roughly 5x5) count as dots.
_SMALL_DOT_MAX_AREA = 25

# Drawings covering more than this fraction of the page count as large.
_LARGE_DRAWING_PAGE_FRACTION = 0.05

# Number of characters of copyright text quoted in the reason string.
_COPYRIGHT_PREVIEW_CHARS = 40

# Lower-case substrings that mark legal/copyright text.
_COPYRIGHT_KEYWORDS = (
    "©",
    "™",
    "®",
    "copyright",
    "trademark",
    "lego.com",
    "lucasfilm",
    "disney",
    "marcas registradas",
)


def _categorize_drawing(
    block: Drawing,
    page_width: float,
    page_height: float,
) -> UnassignedBlockInfo | None:
    """Return the category for an unassigned drawing, or None if no rule fits."""
    bbox = block.bbox

    # Zero width: either a line on the left/right page edge or a stray line.
    if bbox.width == 0:
        if bbox.x0 == 0 or bbox.x0 == page_width:
            return UnassignedBlockInfo(
                block=block,
                category=UnassignedCategory.PAGE_EDGE_LINE,
                reason=f"Zero-width line at page edge (x={bbox.x0})",
                recommendation="Filter page-edge lines in block_filter.py",
            )
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.ZERO_WIDTH,
            reason=f"Zero-width drawing at x={bbox.x0}",
            recommendation="Filter zero-width drawings in block_filter.py",
        )

    # Zero height: either a line on the top/bottom page edge or a stray line.
    if bbox.height == 0:
        if bbox.y0 == 0 or bbox.y0 == page_height:
            return UnassignedBlockInfo(
                block=block,
                category=UnassignedCategory.PAGE_EDGE_LINE,
                reason=f"Zero-height line at page edge (y={bbox.y0})",
                recommendation="Filter page-edge lines in block_filter.py",
            )
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.ZERO_HEIGHT,
            reason=f"Zero-height drawing at y={bbox.y0}",
            recommendation="Filter zero-height drawings in block_filter.py",
        )

    area = bbox.area

    if area < _SMALL_DOT_MAX_AREA:
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.SMALL_DOT,
            reason=f"Very small drawing (area={area:.1f} sq pts)",
            recommendation="Consider if this is a significant element or artifact",
        )

    page_area = page_width * page_height
    # A zero-area page has no meaningful coverage ratio; skipping the check
    # avoids a ZeroDivisionError while formatting the percentage below.
    if page_area > 0 and area > page_area * _LARGE_DRAWING_PAGE_FRACTION:
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.LARGE_UNCLASSIFIED,
            reason=f"Large drawing ({area / page_area * 100:.1f}% of page)",
            recommendation=(
                "Review if this should be a background, diagram, or other element"
            ),
        )

    return None


def _categorize_text(block: Text) -> UnassignedBlockInfo | None:
    """Return the category for an unassigned text block, or None if no rule fits."""
    text = block.text

    if not text.strip():
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.WHITESPACE_TEXT,
            reason="Text contains only whitespace",
            recommendation="Filter whitespace-only text in block_filter.py",
        )

    lowered = text.lower()
    if any(keyword in lowered for keyword in _COPYRIGHT_KEYWORDS):
        preview = text[:_COPYRIGHT_PREVIEW_CHARS]
        # Only show an ellipsis when the text was actually cut short.
        if len(text) > _COPYRIGHT_PREVIEW_CHARS:
            preview += "..."
        return UnassignedBlockInfo(
            block=block,
            category=UnassignedCategory.COPYRIGHT_TEXT,
            reason=f"Copyright/trademark text: '{preview}'",
            recommendation="Add classifier for legal/copyright text",
        )

    return None
191

192

193
def get_unassigned_blocks(result: ClassificationResult) -> list[Blocks]:
    """Collect the blocks of a page that nothing has claimed.

    A block counts as unassigned when ``result.get_best_candidate`` returns
    nothing truthy for it and ``result.is_removed`` is false for it.

    Args:
        result: Classification result to check

    Returns:
        Unassigned blocks, in the order they appear in ``result.page_data.blocks``
    """
    # ``and`` short-circuits, so is_removed() is only consulted for blocks
    # that have no best candidate.
    return [
        candidate_free
        for candidate_free in result.page_data.blocks
        if not result.get_best_candidate(candidate_free)
        and not result.is_removed(candidate_free)
    ]
×
217

218

219
def analyze_unassigned_blocks(
    result: ClassificationResult,
) -> dict[UnassignedCategory, list[UnassignedBlockInfo]]:
    """Bucket every unassigned block of a page by its diagnostic category.

    Args:
        result: Classification result to analyze

    Returns:
        Plain dict from category to the UnassignedBlockInfo entries in that
        category; categories without any block are absent.
    """
    by_category: dict[UnassignedCategory, list[UnassignedBlockInfo]] = {}

    for block in get_unassigned_blocks(result):
        info = categorize_unassigned_block(block, result.page_data)
        by_category.setdefault(info.category, []).append(info)

    return by_category
×
238

239

240
def print_unassigned_diagnostics(
    results: list[ClassificationResult],
    *,
    show_details: bool = True,
    max_per_category: int | None = 5,
) -> None:
    """Print diagnostic report for unassigned blocks across all pages.

    Results with a truthy ``skipped_reason`` and pages without unassigned
    blocks are left out of the report. The report has a category summary,
    aggregated recommendations and, optionally, per-page details.

    Args:
        results: List of classification results
        show_details: If True, show individual block details
        max_per_category: Maximum number of blocks listed per category on
            each page in the details section; ``None`` lists every block.

    Raises:
        ValueError: If ``max_per_category`` is negative.
    """
    # Validate before printing anything so a bad argument yields no partial
    # report.
    if max_per_category is not None and max_per_category < 0:
        raise ValueError("max_per_category must be non-negative or None")

    # Aggregate statistics
    total_unassigned = 0
    category_counts: dict[UnassignedCategory, int] = defaultdict(int)
    pages_with_unassigned: list[
        tuple[int, dict[UnassignedCategory, list[UnassignedBlockInfo]]]
    ] = []

    for result in results:
        if result.skipped_reason:
            continue

        analysis = analyze_unassigned_blocks(result)
        if not analysis:
            continue

        for category, blocks in analysis.items():
            category_counts[category] += len(blocks)
            total_unassigned += len(blocks)

        pages_with_unassigned.append((result.page_data.page_number, analysis))

    if total_unassigned == 0:
        print("\n✓ All blocks are assigned!")
        return

    print(f"\n{'=' * 80}")
    print("UNASSIGNED BLOCK DIAGNOSTICS")
    print(f"{'=' * 80}")
    print(f"\nTotal unassigned blocks: {total_unassigned}")
    print(f"Pages with unassigned blocks: {len(pages_with_unassigned)}")

    # Print category summary, most frequent category first
    print("\nBy Category:")
    print("-" * 60)
    sorted_categories = sorted(
        category_counts.items(), key=lambda x: x[1], reverse=True
    )
    for category, count in sorted_categories:
        pct = count / total_unassigned * 100
        print(f"  {category.name:25} {count:5d} ({pct:5.1f}%)")

    # Print recommendations, most frequent recommendation first
    print("\nRecommendations:")
    print("-" * 60)
    recommendations: dict[str, int] = defaultdict(int)

    for _page_num, analysis in pages_with_unassigned:
        for blocks in analysis.values():
            for info in blocks:
                recommendations[info.recommendation] += 1

    for rec, count in sorted(recommendations.items(), key=lambda x: -x[1]):
        print(f"  • {rec} ({count} blocks)")

    if not show_details:
        return

    # Print per-page details
    print(f"\n{'=' * 80}")
    print("PER-PAGE DETAILS")
    print(f"{'=' * 80}")

    for page_num, analysis in pages_with_unassigned:
        page_total = sum(len(blocks) for blocks in analysis.values())
        print(f"\nPage {page_num}: {page_total} unassigned blocks")

        for category, blocks in sorted(analysis.items(), key=lambda x: x[0].name):
            if not blocks:
                continue
            print(f"  {category.name}:")

            # Cap the listing per category so large pages stay readable.
            shown = blocks if max_per_category is None else blocks[:max_per_category]
            for info in shown:
                block = info.block
                bbox_str = (
                    f"({block.bbox.x0:.1f},{block.bbox.y0:.1f},"
                    f"{block.bbox.x1:.1f},{block.bbox.y1:.1f})"
                )
                print(f"    #{block.id} {type(block).__name__} {bbox_str}")
                print(f"       Reason: {info.reason}")

            hidden = len(blocks) - len(shown)
            if hidden > 0:
                print(f"    ... and {hidden} more")
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc