• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20400711546

20 Dec 2025 10:09PM UTC coverage: 89.367% (+0.006%) from 89.361%
20400711546

push

github

bramp
docs: Add comprehensive Classifier best practices documentation

- Add detailed docstrings to Classifier and RuleBasedClassifier classes
  covering all aspects of writing robust classifiers
- Document scoring phase: API access rules, Score object design,
  intrinsic vs relationship-based scoring
- Document build phase: source block rules, exception handling,
  construction patterns
- Document build_all(): when to use for global coordination
- Add complete code examples for atomic and composite patterns
- Fix DESIGN.md contradiction about Score objects storing candidates
- Update README.md and DESIGN.md to reference class docstrings as
  single source of truth
- Add recommendations to use RuleBasedClassifier for atomic classifiers

This consolidates documentation to reduce duplication and provides
clear guidelines for both humans and AI agents writing new classifiers.

13708 of 15339 relevant lines covered (89.37%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

26.09
/src/build_a_long/pdf_extract/cli/unconsumed_diagnostics.py
1
"""Diagnostic utilities for analyzing unconsumed blocks.
2

3
This module provides tools to categorize and explain why blocks weren't consumed
4
by any LEGO page element. It helps identify patterns in unconsumed blocks and
5
provides actionable recommendations.
6
"""
7

8
from __future__ import annotations
1✔
9

10
from collections import defaultdict
1✔
11
from dataclasses import dataclass
1✔
12
from enum import Enum, auto
1✔
13

14
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
15
    ClassificationResult,
16
)
17
from build_a_long.pdf_extract.extractor.extractor import PageData
1✔
18
from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing, Image, Text
1✔
19

20

21
class UnconsumedCategory(Enum):
    """Categories assigned to blocks that no page element consumed.

    Values are generated with ``auto()``, so they follow declaration order;
    keep new members at the end if the numeric values matter to callers.
    The per-member notes describe the rule that
    ``categorize_unconsumed_block`` applies when it picks each category.
    """

    ZERO_WIDTH = auto()
    """Drawing with zero width (x0 == x1) that is not at a page edge"""

    ZERO_HEIGHT = auto()
    """Drawing with zero height (y0 == y1) that is not at a page edge"""

    PAGE_EDGE_LINE = auto()
    """Zero-width line at x=0 or x=page_width, or zero-height line at y=0 or
    y=page_height"""

    WHITESPACE_TEXT = auto()
    """Text containing only whitespace"""

    COPYRIGHT_TEXT = auto()
    """Text containing a copyright/trademark keyword (e.g. on info pages)"""

    SMALL_DOT = auto()
    """Very small drawing (likely a dot or artifact)"""

    LARGE_UNCLASSIFIED = auto()
    """Large drawing that didn't match any classifier"""

    IMAGE_IN_COMPLEX_PAGE = auto()
    """Image that wasn't consumed by any element (e.g. on catalog/info pages)"""

    UNKNOWN = auto()
    """Block that doesn't fit known categories"""
50

51

52
@dataclass
class UnconsumedBlockInfo:
    """Diagnosis of a single block that no page element consumed.

    Attributes:
        block: The unconsumed block being described.
        category: The category the block was sorted into.
        reason: Human-readable explanation of why that category applies.
        recommendation: Suggested follow-up action for blocks of this kind.
    """

    block: Blocks
    category: UnconsumedCategory
    reason: str
    recommendation: str
60

61

62
def categorize_unconsumed_block(
    block: Blocks,
    page_data: PageData,
) -> UnconsumedBlockInfo:
    """Categorize an unconsumed block and provide actionable information.

    Rules are tried in a fixed order and the first match wins: drawing rules
    (zero width, zero height, small dot, large drawing), then text rules
    (whitespace-only, copyright keywords), then the image rule. Anything
    left over is reported as ``UnconsumedCategory.UNKNOWN``.

    Args:
        block: The unconsumed block
        page_data: Page data for context (page dimensions, etc.)

    Returns:
        UnconsumedBlockInfo with category, reason, and recommendation
    """
    info: UnconsumedBlockInfo | None = None
    if isinstance(block, Drawing):
        info = _categorize_drawing(block, page_data)
    # Deliberately not ``elif``: a block that matched no drawing rule still
    # goes through the remaining checks before falling back to UNKNOWN.
    if info is None and isinstance(block, Text):
        info = _categorize_text(block)
    if info is not None:
        return info

    if isinstance(block, Image):
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.IMAGE_IN_COMPLEX_PAGE,
            reason="Image not consumed by any element",
            recommendation=(
                "Review if this should be a diagram, part_image, or other element"
            ),
        )

    return UnconsumedBlockInfo(
        block=block,
        category=UnconsumedCategory.UNKNOWN,
        reason="Does not match known unconsumed patterns",
        recommendation="Manual review required",
    )


# Substrings (matched against lower-cased text) that mark legal/copyright text.
_COPYRIGHT_KEYWORDS = (
    "©",
    "™",
    "®",
    "copyright",
    "trademark",
    "lego.com",
    "lucasfilm",
    "disney",
    "marcas registradas",
)

# Longest text excerpt quoted in a COPYRIGHT_TEXT reason before truncating.
_REASON_EXCERPT_LEN = 40

# Drawings with an area below this (sq pts, roughly 5x5) count as small dots.
_SMALL_DOT_MAX_AREA = 25

# Drawings covering more than this fraction of the page count as large.
_LARGE_DRAWING_PAGE_FRACTION = 0.05


def _zero_extent_info(
    block: Drawing,
    *,
    dimension: str,
    axis: str,
    position: float,
    page_extent: float,
    category: UnconsumedCategory,
) -> UnconsumedBlockInfo:
    """Describe a zero-width/height drawing, flagging page-edge lines."""
    # NOTE(review): exact comparison against 0 / page size assumes the page
    # bbox starts at the origin and coordinates are not subject to float
    # noise — confirm against the extractor before loosening.
    if position == 0 or position == page_extent:
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.PAGE_EDGE_LINE,
            reason=f"Zero-{dimension} line at page edge ({axis}={position})",
            recommendation="Filter page-edge lines in block_filter.py",
        )
    return UnconsumedBlockInfo(
        block=block,
        category=category,
        reason=f"Zero-{dimension} drawing at {axis}={position}",
        recommendation=f"Filter zero-{dimension} drawings in block_filter.py",
    )


def _categorize_drawing(
    block: Drawing,
    page_data: PageData,
) -> UnconsumedBlockInfo | None:
    """Apply the drawing rules; return None when no drawing rule matches."""
    bbox = block.bbox
    page_width = page_data.bbox.width
    page_height = page_data.bbox.height

    # Zero width is checked first, so a zero-width AND zero-height drawing is
    # reported as zero-width.
    if bbox.width == 0:
        return _zero_extent_info(
            block,
            dimension="width",
            axis="x",
            position=bbox.x0,
            page_extent=page_width,
            category=UnconsumedCategory.ZERO_WIDTH,
        )
    if bbox.height == 0:
        return _zero_extent_info(
            block,
            dimension="height",
            axis="y",
            position=bbox.y0,
            page_extent=page_height,
            category=UnconsumedCategory.ZERO_HEIGHT,
        )

    area = bbox.area
    if area < _SMALL_DOT_MAX_AREA:
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.SMALL_DOT,
            reason=f"Very small drawing (area={area:.1f} sq pts)",
            recommendation="Consider if this is a significant element or artifact",
        )

    page_area = page_width * page_height
    # Guard against a degenerate zero-area page, which would otherwise raise
    # ZeroDivisionError while formatting the percentage below.
    if page_area > 0 and area > page_area * _LARGE_DRAWING_PAGE_FRACTION:
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.LARGE_UNCLASSIFIED,
            reason=f"Large drawing ({area / page_area * 100:.1f}% of page)",
            recommendation=(
                "Review if this should be a background, diagram, or other element"
            ),
        )

    return None


def _categorize_text(block: Text) -> UnconsumedBlockInfo | None:
    """Apply the text rules; return None when no text rule matches."""
    text = block.text
    if not text.strip():
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.WHITESPACE_TEXT,
            reason="Text contains only whitespace",
            recommendation="Filter whitespace-only text in block_filter.py",
        )

    text_lower = text.lower()
    if any(keyword in text_lower for keyword in _COPYRIGHT_KEYWORDS):
        excerpt = text[:_REASON_EXCERPT_LEN]
        # Only mark the excerpt as truncated when text was actually cut off.
        if len(text) > _REASON_EXCERPT_LEN:
            excerpt += "..."
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.COPYRIGHT_TEXT,
            reason=f"Copyright/trademark text: '{excerpt}'",
            recommendation="Add classifier for legal/copyright text",
        )

    return None
188

189

190
def get_unconsumed_blocks(result: ClassificationResult) -> list[Blocks]:
    """Get all unconsumed blocks from a classification result.

    A block is unconsumed when ``result.get_best_candidate(block)`` is falsy
    and ``result.is_removed(block)`` is falsy. Blocks are returned in the
    order they appear in ``result.page_data.blocks``.

    Args:
        result: Classification result to check

    Returns:
        List of blocks that are unconsumed (no candidate and not removed)
    """
    return [
        block
        for block in result.page_data.blocks
        # ``or`` short-circuits, so is_removed() is only consulted for blocks
        # that have no best candidate.
        if not (result.get_best_candidate(block) or result.is_removed(block))
    ]
214

215

216
def analyze_unconsumed_blocks(
    result: ClassificationResult,
) -> dict[UnconsumedCategory, list[UnconsumedBlockInfo]]:
    """Group the unconsumed blocks of one classification result by category.

    Only categories with at least one block appear as keys; keys are in the
    order each category is first encountered.

    Args:
        result: Classification result to analyze

    Returns:
        Dictionary mapping categories to lists of UnconsumedBlockInfo
    """
    grouped: dict[UnconsumedCategory, list[UnconsumedBlockInfo]] = {}
    for leftover in get_unconsumed_blocks(result):
        diagnosis = categorize_unconsumed_block(leftover, result.page_data)
        grouped.setdefault(diagnosis.category, []).append(diagnosis)
    return grouped
235

236

237
def print_unconsumed_diagnostics(
    results: list[ClassificationResult],
    *,
    show_details: bool = True,
) -> None:
    """Print diagnostic report for unconsumed blocks across all pages.

    Results with a truthy ``skipped_reason`` are ignored. When no unconsumed
    blocks are found, a single success line is printed instead of the report.

    Args:
        results: List of classification results
        show_details: If True, show individual block details
    """
    # Aggregate statistics
    total_unconsumed = 0
    category_counts: dict[UnconsumedCategory, int] = defaultdict(int)
    pages_with_unconsumed: list[
        tuple[int, dict[UnconsumedCategory, list[UnconsumedBlockInfo]]]
    ] = []

    for result in results:
        if result.skipped_reason:
            continue

        analysis = analyze_unconsumed_blocks(result)
        if not analysis:
            continue

        for category, infos in analysis.items():
            category_counts[category] += len(infos)
            total_unconsumed += len(infos)

        pages_with_unconsumed.append((result.page_data.page_number, analysis))

    if total_unconsumed == 0:
        print("\n✓ All blocks are consumed!")
        return

    print(f"\n{'=' * 80}")
    print("UNCONSUMED BLOCK DIAGNOSTICS")
    print(f"{'=' * 80}")
    print(f"\nTotal unconsumed blocks: {total_unconsumed}")
    print(f"Pages with unconsumed blocks: {len(pages_with_unconsumed)}")

    _print_category_summary(category_counts, total_unconsumed)
    _print_recommendations(pages_with_unconsumed)
    if show_details:
        _print_page_details(pages_with_unconsumed)


# Maximum number of blocks listed per category on each page in the details.
_MAX_DETAILS_PER_CATEGORY = 5


def _print_category_summary(
    category_counts: dict[UnconsumedCategory, int],
    total_unconsumed: int,
) -> None:
    """Print per-category counts and percentages, most frequent first."""
    print("\nBy Category:")
    print("-" * 60)
    sorted_categories = sorted(
        category_counts.items(), key=lambda x: x[1], reverse=True
    )
    for category, count in sorted_categories:
        pct = count / total_unconsumed * 100
        print(f"  {category.name:25} {count:5d} ({pct:5.1f}%)")


def _print_recommendations(
    pages_with_unconsumed: list[
        tuple[int, dict[UnconsumedCategory, list[UnconsumedBlockInfo]]]
    ],
) -> None:
    """Print each distinct recommendation with how many blocks it covers."""
    print("\nRecommendations:")
    print("-" * 60)
    recommendations: dict[str, int] = defaultdict(int)

    for _page_num, analysis in pages_with_unconsumed:
        for infos in analysis.values():
            for info in infos:
                recommendations[info.recommendation] += 1

    for rec, count in sorted(recommendations.items(), key=lambda x: -x[1]):
        print(f"  • {rec} ({count} blocks)")


def _print_page_details(
    pages_with_unconsumed: list[
        tuple[int, dict[UnconsumedCategory, list[UnconsumedBlockInfo]]]
    ],
) -> None:
    """Print a capped per-page, per-category listing of unconsumed blocks."""
    print(f"\n{'=' * 80}")
    print("PER-PAGE DETAILS")
    print(f"{'=' * 80}")

    for page_num, analysis in pages_with_unconsumed:
        page_total = sum(len(infos) for infos in analysis.values())
        print(f"\nPage {page_num}: {page_total} unconsumed blocks")

        # Categories are listed alphabetically by name within each page.
        for category, infos in sorted(analysis.items(), key=lambda x: x[0].name):
            if not infos:
                continue
            print(f"  {category.name}:")
            for info in infos[:_MAX_DETAILS_PER_CATEGORY]:
                block = info.block
                bbox_str = (
                    f"({block.bbox.x0:.1f},{block.bbox.y0:.1f},"
                    f"{block.bbox.x1:.1f},{block.bbox.y1:.1f})"
                )
                print(f"    #{block.id} {type(block).__name__} {bbox_str}")
                print(f"       Reason: {info.reason}")

            if len(infos) > _MAX_DETAILS_PER_CATEGORY:
                hidden = len(infos) - _MAX_DETAILS_PER_CATEGORY
                print(f"    ... and {hidden} more")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc