• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19521244091

15 Nov 2025 02:10AM UTC coverage: 90.833% (-0.4%) from 91.217%
19521244091

push

github

bramp
refactor: remove unused code and simplify domain invariant tests

Removed approximately 220 lines of unused/redundant code from classifier tests:

classifier_rules_test.py:
- Removed ClassifiedPage wrapper class (~110 lines) - never instantiated
- Removed helper functions (_parts_lists, _part_images, _part_counts,
  _print_label_counts) - never called
- Cleaned up unused imports (defaultdict, Block, ClassificationResult)
- Updated docstring to reflect remaining test coverage

domain_invariants_test.py:
- Simplified all 4 tests to use result.page property directly
- Replaced verbose 6-line get_candidates() pattern with simple property access
- Removed redundant isinstance(page, Page) assertions (~48 lines total)
- Tests now more clearly express intent: validate Page/PartsList/Part objects

All tests continue to pass. No functionality was lost.

4 of 4 new or added lines in 2 files covered. (100.0%)

151 existing lines in 7 files now uncovered.

4994 of 5498 relevant lines covered (90.83%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

27.31
/src/build_a_long/pdf_extract/cli/reporting.py
1
"""Reporting and output formatting for PDF extraction."""
2

3
import logging
1✔
4
from collections import defaultdict
1✔
5
from typing import Any
1✔
6

7
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
8
    ClassificationResult,
9
)
10
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
11
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
12
from build_a_long.pdf_extract.extractor import PageData
1✔
13
from build_a_long.pdf_extract.extractor.hierarchy import build_hierarchy_from_blocks
1✔
14
from build_a_long.pdf_extract.extractor.lego_page_elements import Page
1✔
15
from build_a_long.pdf_extract.extractor.page_blocks import Block
1✔
16

17
# Logger named after this module.
logger = logging.getLogger(__name__)

# ANSI escape sequences; GREY/RESET wrap the lines of removed blocks in the
# classification debug output.
GREY = "\033[90m"  # bright-black (grey) foreground
RESET = "\033[0m"  # reset all text attributes
1✔
22

23

24
def print_summary(
    pages: list[PageData],
    results: list[ClassificationResult],
    *,
    detailed: bool = False,
) -> None:
    """Write an aggregate classification report for all pages to stdout.

    The report lists the page and block totals, a tally of blocks per block
    class and per assigned label, and the share of pages on which at least
    one block was labeled ``"page_number"``.

    Args:
        pages: Extracted pages. Must have the same length as ``results``;
            the strict ``zip`` raises ``ValueError`` otherwise.
        results: Classification result for each page, in the same order.
        detailed: When True, also print the page numbers (at most the first
            20) of pages on which no block was labeled ``"page_number"``.
    """
    page_total = len(pages)
    block_total = 0
    type_tally: defaultdict[str, int] = defaultdict(int)
    label_tally: defaultdict[str, int] = defaultdict(int)
    numbered_pages = 0
    unnumbered: list[int] = []

    for page, result in zip(pages, results, strict=True):
        blocks = page.blocks
        block_total += len(blocks)

        # Labels found on this page, kept so the page-number check can run
        # once the whole page has been tallied.
        labels_on_page: list[str] = []
        for block in blocks:
            type_tally[block.__class__.__name__.lower()] += 1
            if found := result.get_label(block):
                label_tally[found] += 1
                labels_on_page.append(found)

        if "page_number" in labels_on_page:
            numbered_pages += 1
        else:
            unnumbered.append(page.page_number)

    # Guard against an empty document to avoid dividing by zero.
    if page_total:
        coverage = numbered_pages / page_total * 100.0
    else:
        coverage = 0.0

    print("=== Classification summary ===")
    print(f"Pages processed: {page_total}")
    print(f"Total blocks: {block_total}")
    if type_tally:
        by_type = ", ".join(f"{k}={v}" for k, v in sorted(type_tally.items()))
        print("Elements by type: " + by_type)
    if label_tally:
        by_label = ", ".join(f"{k}={v}" for k, v in sorted(label_tally.items()))
        print("Labeled elements: " + by_label)
    print(
        f"Page-number coverage: {numbered_pages}/{page_total} "
        f"({coverage:.1f}%)"
    )
    if detailed and unnumbered:
        shown = ", ".join(map(str, unnumbered[:20]))
        suffix = " ..." if len(unnumbered) > 20 else ""
        print(f"Pages missing page number: {shown}{suffix}")
×
84

85

86
def _print_font_size_distribution(
1✔
87
    title: str,
88
    counter: Any,
89
    *,
90
    max_items: int = 10,
91
    empty_message: str = "(no data)",
92
    total_label: str = "Total text elements",
93
    unique_label: str = "Total unique sizes",
94
) -> None:
95
    """Print a font size distribution with bar chart.
96

97
    Args:
98
        title: Section title to display
99
        counter: Counter/dict mapping font sizes to counts
100
        max_items: Maximum number of items to display
101
        empty_message: Message to show when counter is empty
102
        total_label: Label for total count summary
103
        unique_label: Label for unique size count
104
    """
105
    print(title)
×
UNCOV
106
    print("-" * 60)
×
107

UNCOV
108
    total = sum(counter.values())
×
109

110
    if total > 0:
×
111
        print(f"{'Size':>8} | {'Count':>6} | Distribution")
×
UNCOV
112
        print("-" * 60)
×
113

114
        # Get most common items
115
        if hasattr(counter, "most_common"):
×
UNCOV
116
            items = counter.most_common(max_items)
×
117
        else:
UNCOV
118
            items = sorted(counter.items(), key=lambda x: x[1], reverse=True)[
×
119
                :max_items
120
            ]
121

122
        max_count = items[0][1] if items else 1
×
123
        for size, count in items:
×
124
            bar_length = int((count / max_count) * 30)
×
125
            bar = "█" * bar_length
×
UNCOV
126
            print(f"{size:8.1f} | {count:6d} | {bar}")
×
127

128
        print("-" * 60)
×
129
        print(f"{unique_label}: {len(counter)}")
×
UNCOV
130
        print(f"{total_label}: {total}")
×
131
    else:
132
        print(empty_message)
×
UNCOV
133
    print()
×
134

135

136
def print_histogram(histogram: TextHistogram) -> None:
    """Print the text histogram showing font size and name distributions.

    Four font-size sections are printed (part counts, page numbers, element
    IDs, remaining integers), followed by a font-name section listing the 20
    most common font names. In the font-name chart the most common font gets
    a 30-cell bar, and any font with a nonzero count gets at least one cell.

    Args:
        histogram: TextHistogram containing font statistics across all pages
    """
    print("=== Text Histogram ===")
    print()

    # 1. Part counts (\dx pattern)
    _print_font_size_distribution(
        "1. Part Count Font Sizes (\\dx pattern, e.g., '2x', '3x'):",
        histogram.part_count_font_sizes,
        empty_message="(no part count data)",
        total_label="Total part counts",
    )

    # 2. Page numbers (±1)
    _print_font_size_distribution(
        "2. Page Number Font Sizes (digits ±1 from current page):",
        histogram.page_number_font_sizes,
        empty_message="(no page number data)",
        total_label="Total page numbers",
    )

    # 3. Element IDs (6-7 digit numbers)
    _print_font_size_distribution(
        "3. Element ID Font Sizes (6-7 digit numbers):",
        histogram.element_id_font_sizes,
        empty_message="(no Element ID data)",
        total_label="Total Element IDs",
    )

    # 4. Other integer font sizes
    _print_font_size_distribution(
        "4. Other Integer Font Sizes (integers not matching above patterns):",
        histogram.remaining_font_sizes,
        max_items=20,
        empty_message="(no other integer font size data)",
    )

    # 5. Font name distribution
    print("5. Font Name Distribution:")
    print("-" * 60)

    font_name_total = sum(histogram.font_name_counts.values())

    if font_name_total > 0:
        print(f"{'Font Name':<30} | {'Count':>6} | Distribution")
        print("-" * 60)

        font_names = histogram.font_name_counts.most_common(20)
        # most_common() is ordered by descending count: first entry is the max.
        max_count = font_names[0][1] if font_names else 1
        for font_name, count in font_names:
            bar_length = int((count / max_count) * 30)
            if count > 0:
                # int() truncates small ratios to 0; keep rare fonts visible.
                bar_length = max(1, bar_length)
            bar = "█" * bar_length
            # Keep the name column at 30 characters: 27 + "..." when too long.
            name_display = font_name[:27] + "..." if len(font_name) > 30 else font_name
            print(f"{name_display:<30} | {count:6d} | {bar}")

        print("-" * 60)
        print(f"Total unique fonts:  {len(histogram.font_name_counts)}")
        print(f"Total text elements: {font_name_total}")
    else:
        print("(no font name data)")

    print()
×
202

203

204
def print_font_hints(hints: FontSizeHints) -> None:
    """Write the font sizes identified for each element kind to stdout.

    Each identified size is shown with one decimal and a ``pt`` suffix, or
    ``N/A`` when it is ``None``. The sizes left over after removing the known
    patterns are listed afterwards as a size/count table.

    Args:
        hints: FontSizeHints containing identified font sizes for different elements
    """

    def show(caption: str, size: float | None) -> None:
        """Print one caption followed by the formatted size."""
        rendered = "N/A" if size is None else f"{size:.1f}pt"
        print(f"{caption}{rendered}")

    print("=== Font Size Hints ===")
    print()

    print("Identified font sizes:")
    show("  Part count size:         ", hints.part_count_size)
    show("  Catalog part count size: ", hints.catalog_part_count_size)
    show("  Step number size:        ", hints.step_number_size)
    show("  Step repeat size:        ", hints.step_repeat_size)
    show("  Catalog element ID size: ", hints.catalog_element_id_size)
    show("  Page number size:        ", hints.page_number_size)

    print()
    print("Remaining font sizes after removing known patterns:")
    leftovers = hints.remaining_font_sizes
    if not leftovers:
        print("  (no remaining font sizes)")
    else:
        print(f"{'Size':>8} | {'Count':>6}")
        print("-" * 20)
        for size, count in leftovers:
            print(f"{size:8.1f} | {count:6d}")
        print(f"\nTotal unique sizes: {len(leftovers)}")
    print()
×
236

237

238
def _print_debug_block_tree(page: PageData, result: ClassificationResult) -> None:
    """Print every block of ``page`` as an indented tree, one line per block."""
    block_tree = build_hierarchy_from_blocks(page.blocks)

    def print_block(block: Block, depth: int, is_last: bool = True) -> None:
        """Print ``block``, then recursively its children ordered by id."""
        if depth == 0:
            tree_prefix = ""
        else:
            tree_char = "└─" if is_last else "├─"
            tree_prefix = f"{'  ' * (depth - 1)}{tree_char} "

        # Removed blocks are wrapped in GREY ... RESET.
        is_removed = result.is_removed(block)
        color = GREY if is_removed else ""
        reset = RESET if is_removed else ""

        # Show the element built by the winning candidate when there is one,
        # otherwise fall back to the raw block.
        elem_str = str(block)
        block_label = result.get_label(block)
        if block_label:
            winner = result.get_winner_candidate(block)
            if winner and winner.constructed:
                elem_str = str(winner.constructed)

        line = f"{color}{tree_prefix}{block.id:3d} "

        if is_removed:
            reason = result.get_removal_reason(block)
            reason_text = reason.reason_type if reason else "unknown"
            line += f"* REMOVED: {reason_text}"
            if reason:
                target = reason.target_block
                line += f" by {target.id}"
                target_label = result.get_label(target)
                if target_label:
                    line += f" ({target_label})"
            line += f"* {elem_str}"
        elif block_label:
            line += f"[{block_label}] {elem_str}"
        else:
            line += f"[no candidates] {elem_str}"

        print(line + reset)

        children = sorted(block_tree.get_children(block), key=lambda e: e.id)
        last_index = len(children) - 1
        for i, child in enumerate(children):
            print_block(child, depth + 1, i == last_index)

    for root in sorted(block_tree.roots, key=lambda e: e.id):
        print_block(root, 0)


def _print_debug_candidates(result: ClassificationResult, label: str | None) -> None:
    """Print a per-label candidate table, then each label's winning candidates."""
    print(f"\n{'=' * 80}")
    print("CANDIDATES BY LABEL")
    print(f"{'=' * 80}")

    all_candidates = result.get_all_candidates()

    # Restrict the report to a single label if one was requested.
    if label:
        labels_to_show = {label: all_candidates.get(label, [])}
    else:
        labels_to_show = all_candidates

    # Winners per label, in sorted label order, computed once for both passes.
    winners_by_label = {
        lbl: [c for c in labels_to_show[lbl] if c.is_winner]
        for lbl in sorted(labels_to_show)
    }

    # Summary table
    print(f"\n{'Label':<20} {'Total':<8} {'Winners':<8}")
    print(f"{'-' * 40}")
    for lbl, winners in winners_by_label.items():
        print(f"{lbl:<20} {len(labels_to_show[lbl]):<8} {len(winners):<8}")

    # Detailed per-label breakdown; labels without winners are skipped for
    # brevity.
    for lbl, winners in winners_by_label.items():
        if not winners:
            continue

        print(f"\n{lbl} ({len(winners)} winner{'s' if len(winners) > 1 else ''}):")
        for candidate in winners:
            block = candidate.source_block
            # Format similar to tree: block_id [label] constructed | source
            block_id_str = f"{block.id:3d}" if block else "  ?"
            constructed_str = str(candidate.constructed)
            source_str = str(block) if block else "no source"
            print(
                f"  {block_id_str} [{lbl}] {constructed_str} | "
                f"score={candidate.score:.3f} | {source_str}"
            )


def _print_debug_page_summary(result: ClassificationResult) -> None:
    """Print page number, progress bar and steps of ``result.page``, if set."""
    page_obj = result.page
    if not page_obj:
        return

    print(f"\n{'=' * 80}")
    print("PAGE HIERARCHY")
    print(f"{'=' * 80}")
    page_num_str = page_obj.page_number.value if page_obj.page_number else "None"
    print(f"Page number: {page_num_str}")
    print(f"Progress bar: {'Yes' if page_obj.progress_bar else 'No'}")
    print(f"Steps: {len(page_obj.steps)}")

    for i, step in enumerate(page_obj.steps, 1):
        parts_count = len(step.parts_list.parts)
        print(f"  Step {i}: #{step.step_number.value} ({parts_count} parts)")


def print_classification_debug(
    page: PageData,
    result: ClassificationResult,
    *,
    show_candidates: bool = True,
    show_hierarchy: bool = True,
    label: str | None = None,
) -> None:
    """Print comprehensive classification debug information.

    Shows all classification details in one consolidated view:
    - Block hierarchy with labels and removal status
    - Summary counts and any warnings reported by ``result``
    - Detailed candidate analysis (if requested)
    - Page hierarchy summary (if requested)

    In the summary counts each block is counted exactly once, with the same
    precedence as the tree: removed first, then labeled ("Winners"), then
    "No candidates". A block that is both labeled and removed therefore
    counts only as removed, and the three figures always add up to the total.

    Args:
        page: PageData containing all elements
        result: ClassificationResult with classification information
        show_candidates: Include detailed candidate breakdown
        show_hierarchy: Include page hierarchy summary
        label: If provided, filter candidate analysis to this label only
    """
    print(f"\n{'=' * 80}")
    print(f"CLASSIFICATION DEBUG - Page {page.page_number}")
    print(f"{'=' * 80}\n")

    _print_debug_block_tree(page, result)

    # Summary stats: one bucket per block, mirroring the tree's precedence.
    total = len(page.blocks)
    removed = 0
    with_labels = 0
    for block in page.blocks:
        if result.is_removed(block):
            removed += 1
        elif result.get_label(block):
            with_labels += 1
    no_candidates = total - with_labels - removed

    print(f"\n{'─' * 80}")
    print(
        f"Total: {total} | Winners: {with_labels} | "
        f"Removed: {removed} | No candidates: {no_candidates}"
    )

    warnings = result.get_warnings()
    if warnings:
        print(f"Warnings: {len(warnings)}")
        for warning in warnings:
            print(f"  ⚠ {warning}")

    if show_candidates:
        _print_debug_candidates(result, label)

    if show_hierarchy:
        _print_debug_page_summary(result)

    print(f"\n{'=' * 80}\n")
×
406

407

408
def print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log label count statistics for a page.

    Despite the name, nothing is written to stdout: the tally is emitted as a
    single INFO record on the module logger. Blocks for which
    ``result.get_label`` returns a falsy value are counted under
    ``"<unknown>"``.

    Args:
        page: PageData containing all elements
        result: ClassificationResult with labels
    """
    # A plain dict keeps the logged text free of the
    # "defaultdict(<class 'int'>, ...)" wrapper.
    label_counts: dict[str, int] = {}
    for block in page.blocks:
        label = result.get_label(block) or "<unknown>"
        label_counts[label] = label_counts.get(label, 0) + 1

    # Lazy %-style arguments: the message is only built if INFO is enabled.
    logger.info("Page %s Label counts: %s", page.page_number, label_counts)
×
423

424

425
def print_page_hierarchy(page_data: PageData, page: Page) -> None:
    """Write the structured contents of one LEGO page to stdout.

    Prints the detected page number, every step with its parts list and
    diagram bounding box, any warnings, and the number of unprocessed
    elements. Sections whose data is empty or missing are omitted.

    Args:
        page_data: PageData containing the raw page number
        page: Structured Page object with steps, parts lists, etc.
    """
    print(f"Page {page_data.page_number}:")

    if number := page.page_number:
        print(f"  ✓ Page Number: {number.value}")

    if steps := page.steps:
        print(f"  ✓ Steps: {len(steps)}")
        for step in steps:
            parts = step.parts_list.parts
            print(f"    - Step {step.step_number.value} ({len(parts)} parts)")

            # Parts list details
            if not parts:
                print("      Parts List: (empty)")
            else:
                print("      Parts List:")
                for part in parts:
                    element = part.number.element_id if part.number else "no number"
                    print(f"        • {part.count.count}x ({element})")

            print(f"      Diagram: {step.diagram.bbox}")

    if issues := page.warnings:
        print(f"  ⚠ Warnings: {len(issues)}")
        for issue in issues:
            print(f"    - {issue}")

    if leftover := page.unprocessed_elements:
        print(f"  ℹ Unprocessed elements: {len(leftover)}")
×
460

461

462
def build_and_print_page_hierarchy(
    pages: list[PageData], results: list[ClassificationResult]
) -> None:
    """Print the structured page hierarchy for every page that has one.

    Pages whose ``result.page`` is falsy are skipped silently.

    Args:
        pages: List of PageData containing extracted elements. Must have the
            same length as ``results``; the strict ``zip`` raises
            ``ValueError`` otherwise.
        results: List of ClassificationResult with labels and relationships
    """
    print("Building LEGO page hierarchy...")

    for page_data, result in zip(pages, results, strict=True):
        structured = result.page
        if not structured:
            continue
        print_page_hierarchy(page_data, structured)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc