19521244091

Committed 15 Nov 2025 02:10AM UTC coverage: 90.833% (-0.4%) from 91.217%

Build # 19521244091

Build Type

push

github

Committed by

bramp

Commit Message

refactor: remove unused code and simplify domain invariant tests

Removed approximately 220 lines of unused/redundant code from classifier tests:

classifier_rules_test.py:
- Removed ClassifiedPage wrapper class (~110 lines) - never instantiated
- Removed helper functions (_parts_lists, _part_images, _part_counts,
  _print_label_counts) - never called
- Cleaned up unused imports (defaultdict, Block, ClassificationResult)
- Updated docstring to reflect remaining test coverage

domain_invariants_test.py:
- Simplified all 4 tests to use result.page property directly
- Replaced verbose 6-line get_candidates() pattern with simple property access
- Removed redundant isinstance(page, Page) assertions (~48 lines total)
- Tests now more clearly express intent: validate Page/PartsList/Part objects

All tests continue to pass. No functionality was lost.

Run Details

4 of 4 new or added lines in 2 files covered. (100.0%)

151 existing lines in 7 files now uncovered.

4994 of 5498 relevant lines covered (90.83%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

27.31

/src/build_a_long/pdf_extract/cli/reporting.py

"""Reporting and output formatting for PDF extraction."""

import logging
from collections import defaultdict
from typing import Any

from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.hierarchy import build_hierarchy_from_blocks
from build_a_long.pdf_extract.extractor.lego_page_elements import Page
from build_a_long.pdf_extract.extractor.page_blocks import Block

# Module-level logger. Most reporters in this module write to stdout with
# print(); print_label_counts is the one that reports through this logger.
logger = logging.getLogger(__name__)

# ANSI color codes
# Escape sequences wrapped around the lines of removed blocks in
# print_classification_debug so they render dimmed on ANSI-capable terminals.
GREY = "\033[90m"  # bright-black ("grey") foreground
RESET = "\033[0m"  # reset all attributes


def print_summary(
    pages: list[PageData],
    results: list[ClassificationResult],
    *,
    detailed: bool = False,
) -> None:
    """Write an aggregate classification report for a whole document to stdout.

    Pages and results are walked in lockstep (``zip(..., strict=True)``, so
    lists of different lengths raise ``ValueError``). The report contains the
    page and block totals, a per-block-class tally, a per-label tally, and the
    share of pages on which at least one block is labeled ``"page_number"``.

    Args:
        pages: List of PageData containing extracted elements
        results: List of ClassificationResult with labels, one per page
        detailed: If True, also list (up to 20 of) the pages on which no
            block was labeled as a page number
    """
    total_pages = len(pages)
    block_total = 0
    type_tally: defaultdict[str, int] = defaultdict(int)
    label_tally: defaultdict[str, int] = defaultdict(int)
    numbered_pages = 0
    unnumbered: list[int] = []

    for page, result in zip(pages, results, strict=True):
        block_total += len(page.blocks)
        found_page_number = False

        for block in page.blocks:
            # Block kinds are keyed by lower-cased class name.
            type_tally[block.__class__.__name__.lower()] += 1

            label = result.get_label(block)
            if not label:
                continue
            label_tally[label] += 1
            if label == "page_number":
                found_page_number = True

        if not found_page_number:
            unnumbered.append(page.page_number)
        else:
            numbered_pages += 1

    # Guard the division so an empty document reports 0.0% instead of raising.
    if total_pages:
        coverage = numbered_pages / total_pages * 100.0
    else:
        coverage = 0.0

    print("=== Classification summary ===")
    print(f"Pages processed: {total_pages}")
    print(f"Total blocks: {block_total}")
    if type_tally:
        by_type = ", ".join(f"{k}={v}" for k, v in sorted(type_tally.items()))
        print("Elements by type: " + by_type)
    if label_tally:
        by_label = ", ".join(f"{k}={v}" for k, v in sorted(label_tally.items()))
        print("Labeled elements: " + by_label)
    print(f"Page-number coverage: {numbered_pages}/{total_pages} ({coverage:.1f}%)")

    if detailed and unnumbered:
        # Cap the listing at 20 pages and flag the truncation with " ...".
        sample = ", ".join(map(str, unnumbered[:20]))
        more = " ..." if len(unnumbered) > 20 else ""
        print(f"Pages missing page number: {sample}{more}")


def _print_font_size_distribution(
    title: str,
    counter: Any,
    *,
    max_items: int = 10,
    empty_message: str = "(no data)",
    total_label: str = "Total text elements",
    unique_label: str = "Total unique sizes",
) -> None:
    """Print one titled font-size table with a proportional bar per row.

    Rows are ordered by descending count and limited to ``max_items``. Each
    bar is scaled so that the largest displayed count spans 30 characters.

    Args:
        title: Section title to display
        counter: Counter/dict mapping font sizes to counts
        max_items: Maximum number of rows to display
        empty_message: Message to show when the counts do not sum above zero
        total_label: Label for the summed-count summary line
        unique_label: Label for the number-of-distinct-sizes summary line
    """
    rule = "-" * 60

    print(title)
    print(rule)

    grand_total = sum(counter.values())
    if grand_total <= 0:
        print(empty_message)
        print()
        return

    print(f"{'Size':>8} | {'Count':>6} | Distribution")
    print(rule)

    # A Counter ranks itself; any other mapping is ranked by hand.
    if hasattr(counter, "most_common"):
        ranked = counter.most_common(max_items)
    else:
        by_count_desc = sorted(
            counter.items(), key=lambda pair: pair[1], reverse=True
        )
        ranked = by_count_desc[:max_items]

    # The first row holds the largest count; every bar is relative to it.
    peak = ranked[0][1] if ranked else 1
    for size, count in ranked:
        bar = "█" * int((count / peak) * 30)
        print(f"{size:8.1f} | {count:6d} | {bar}")

    print(rule)
    print(f"{unique_label}: {len(counter)}")
    print(f"{total_label}: {grand_total}")
    print()


def print_histogram(histogram: TextHistogram) -> None:
    """Print every distribution held by a text histogram to stdout.

    Four font-size tables are printed first (part counts, page numbers,
    element IDs, other integers), followed by a table of the 20 most common
    font names with names longer than 30 characters shortened.

    Args:
        histogram: TextHistogram containing font statistics across all pages
    """
    rule = "-" * 60

    print("=== Text Histogram ===")
    print()

    # Sections 1-4 share one renderer and differ only in title, data and
    # labels, so they are described as rows and printed in order.
    size_sections = (
        (
            "1. Part Count Font Sizes (\\dx pattern, e.g., '2x', '3x'):",
            histogram.part_count_font_sizes,
            {
                "empty_message": "(no part count data)",
                "total_label": "Total part counts",
            },
        ),
        (
            "2. Page Number Font Sizes (digits ±1 from current page):",
            histogram.page_number_font_sizes,
            {
                "empty_message": "(no page number data)",
                "total_label": "Total page numbers",
            },
        ),
        (
            "3. Element ID Font Sizes (6-7 digit numbers):",
            histogram.element_id_font_sizes,
            {
                "empty_message": "(no Element ID data)",
                "total_label": "Total Element IDs",
            },
        ),
        (
            "4. Other Integer Font Sizes (integers not matching above patterns):",
            histogram.remaining_font_sizes,
            {
                "max_items": 20,
                "empty_message": "(no other integer font size data)",
            },
        ),
    )
    for heading, size_counts, options in size_sections:
        _print_font_size_distribution(heading, size_counts, **options)

    # Section 5: font names get a wider first column and their own totals.
    print("5. Font Name Distribution:")
    print(rule)

    name_counts = histogram.font_name_counts
    grand_total = sum(name_counts.values())

    if grand_total <= 0:
        print("(no font name data)")
        print()
        return

    print(f"{'Font Name':<30} | {'Count':>6} | Distribution")
    print(rule)

    ranked = name_counts.most_common(20)
    # Bars are scaled against the most frequent font (first row).
    peak = ranked[0][1] if ranked else 1
    for font_name, count in ranked:
        bar = "█" * int((count / peak) * 30)
        if len(font_name) > 30:
            # Keep the column at 30 characters: 27 of the name plus "...".
            shown = font_name[:27] + "..."
        else:
            shown = font_name
        print(f"{shown:<30} | {count:6d} | {bar}")

    print(rule)
    print(f"Total unique fonts:  {len(name_counts)}")
    print(f"Total text elements: {grand_total}")
    print()


def print_font_hints(hints: FontSizeHints) -> None:
    """Print the font size identified for each kind of element.

    Each identified size is shown with one decimal and a ``pt`` suffix, or
    ``N/A`` when it is None. The leftover ``(size, count)`` pairs are then
    listed as a small table.

    Args:
        hints: FontSizeHints containing identified font sizes for different elements
    """
    print("=== Font Size Hints ===")
    print()
    print("Identified font sizes:")

    # Prefixes are padded by hand so that all values start in the same column.
    identified = (
        ("  Part count size:         ", hints.part_count_size),
        ("  Catalog part count size: ", hints.catalog_part_count_size),
        ("  Step number size:        ", hints.step_number_size),
        ("  Step repeat size:        ", hints.step_repeat_size),
        ("  Catalog element ID size: ", hints.catalog_element_id_size),
        ("  Page number size:        ", hints.page_number_size),
    )
    for prefix, size in identified:
        shown = "N/A" if size is None else f"{size:.1f}pt"
        print(f"{prefix}{shown}")

    print()
    print("Remaining font sizes after removing known patterns:")
    leftovers = hints.remaining_font_sizes
    if not leftovers:
        print("  (no remaining font sizes)")
    else:
        print(f"{'Size':>8} | {'Count':>6}")
        print("-" * 20)
        for size, count in leftovers:
            print(f"{size:8.1f} | {count:6d}")
        print(f"\nTotal unique sizes: {len(leftovers)}")
    print()


def print_classification_debug(
    page: PageData,
    result: ClassificationResult,
    *,
    show_candidates: bool = True,
    show_hierarchy: bool = True,
    label: str | None = None,
) -> None:
    """Print comprehensive classification debug information.

    Shows all classification details in one consolidated view, written to
    stdout in this order:
    - Block hierarchy with labels and removal status (always)
    - One-line totals and any warnings reported by ``result`` (always)
    - Detailed candidate analysis (if requested)
    - Page hierarchy summary (if requested and ``result.page`` is set)

    Args:
        page: PageData containing all elements
        result: ClassificationResult with classification information
        show_candidates: Include detailed candidate breakdown
        show_hierarchy: Include page hierarchy summary
        label: If provided, filter candidate analysis to this label only.
            It does not filter the block tree or the totals.
    """
    print(f"\n{'=' * 80}")
    print(f"CLASSIFICATION DEBUG - Page {page.page_number}")
    print(f"{'=' * 80}\n")

    # Build block hierarchy tree
    block_tree = build_hierarchy_from_blocks(page.blocks)

    def print_block(block: Block, depth: int, is_last: bool = True) -> None:
        """Recursively print a block and its children.

        Args:
            block: The block to print on this line.
            depth: Nesting level; 0 for a root, which gets no tree prefix.
            is_last: Whether this block is the last of its siblings, which
                selects the corner ("└─") versus tee ("├─") connector.
        """
        # Build tree characters
        if depth == 0:
            tree_prefix = ""
            indent = ""
        else:
            tree_char = "└─" if is_last else "├─"
            # Two spaces per level below the first nested level.
            indent = "  " * (depth - 1)
            tree_prefix = f"{indent}{tree_char} "

        # Base info
        # Removed blocks are wrapped in GREY ... RESET; others are uncolored.
        is_removed = result.is_removed(block)
        color = GREY if is_removed else ""
        reset = RESET if is_removed else ""

        # Build line - get constructed element from winner candidate
        # Falls back to the raw block text when there is no label, no winner,
        # or the winner has nothing constructed.
        elem_str = str(block)
        # This `label` is local to print_block; it does not read or change the
        # outer function's `label` filter argument.
        label = result.get_label(block)
        if label:
            winner = result.get_winner_candidate(block)
            if winner and winner.constructed:
                elem_str = str(winner.constructed)

        line = f"{color}{tree_prefix}{block.id:3d} "

        # Removal takes precedence over the label when choosing the line form.
        if is_removed:
            reason = result.get_removal_reason(block)
            reason_text = reason.reason_type if reason else "unknown"
            line += f"* REMOVED: {reason_text}"
            if reason:
                # NOTE(review): assumes reason.target_block is never None when
                # a reason exists - confirm against the removal-reason type.
                target = reason.target_block
                line += f" by {target.id}"
                target_label = result.get_label(target)
                if target_label:
                    line += f" ({target_label})"
            line += f"* {elem_str}"
        elif label:
            line += f"[{label}] {elem_str}"
        else:
            line += f"[no candidates] {elem_str}"

        line += reset
        print(line)

        # Print children
        # Children are visited in ascending block id for a stable listing.
        children = block_tree.get_children(block)
        sorted_children = sorted(children, key=lambda e: e.id)
        for i, child in enumerate(sorted_children):
            child_is_last = i == len(sorted_children) - 1
            print_block(child, depth + 1, child_is_last)

    # Print root blocks
    sorted_roots = sorted(block_tree.roots, key=lambda e: e.id)
    for root in sorted_roots:
        print_block(root, 0)

    # Summary stats
    total = len(page.blocks)
    with_labels = sum(1 for b in page.blocks if result.get_label(b) is not None)
    removed = sum(1 for b in page.blocks if result.is_removed(b))
    # NOTE(review): a block that is both labeled and removed is subtracted
    # twice here, which would understate this figure - confirm whether
    # ClassificationResult allows that combination.
    no_candidates = total - with_labels - removed

    print(f"\n{'─' * 80}")
    print(
        f"Total: {total} | Winners: {with_labels} | "
        f"Removed: {removed} | No candidates: {no_candidates}"
    )

    warnings = result.get_warnings()
    if warnings:
        print(f"Warnings: {len(warnings)}")
        for warning in warnings:
            print(f"  ⚠ {warning}")

    # Detailed candidate analysis
    if show_candidates:
        print(f"\n{'=' * 80}")
        print("CANDIDATES BY LABEL")
        print(f"{'=' * 80}")

        # Get all candidates
        # Used below as a mapping from label to a list of candidates.
        all_candidates = result.get_all_candidates()

        # Filter to specific label if requested
        # An unknown label still yields one (empty) row in the summary table.
        if label:
            labels_to_show = {label: all_candidates.get(label, [])}
        else:
            labels_to_show = all_candidates

        # Summary table
        print(f"\n{'Label':<20} {'Total':<8} {'Winners':<8}")
        print(f"{'-' * 40}")
        for lbl in sorted(labels_to_show.keys()):
            candidates = labels_to_show[lbl]
            winners = [c for c in candidates if c.is_winner]
            print(f"{lbl:<20} {len(candidates):<8} {len(winners):<8}")

        # Detailed per-label breakdown
        # Only winning candidates are listed; losers appear solely in the
        # "Total" column of the summary table above.
        for lbl in sorted(labels_to_show.keys()):
            candidates = labels_to_show[lbl]
            if not candidates:
                continue

            winners = [c for c in candidates if c.is_winner]
            if not winners:
                continue  # Skip labels with no winners for brevity

            print(f"\n{lbl} ({len(winners)} winner{'s' if len(winners) > 1 else ''}):")
            for candidate in winners:
                block = candidate.source_block
                # Format similar to tree: block_id [label] constructed | source
                # A candidate without a source block shows "?" and "no source".
                block_id_str = f"{block.id:3d}" if block else "  ?"
                constructed_str = str(candidate.constructed)
                source_str = str(block) if block else "no source"
                print(
                    f"  {block_id_str} [{lbl}] {constructed_str} | "
                    f"score={candidate.score:.3f} | {source_str}"
                )

    # Page hierarchy
    if show_hierarchy:
        page_obj = result.page
        # The section is skipped entirely when no structured page is available.
        if page_obj:
            print(f"\n{'=' * 80}")
            print("PAGE HIERARCHY")
            print(f"{'=' * 80}")
            page_num_str = (
                page_obj.page_number.value if page_obj.page_number else "None"
            )
            print(f"Page number: {page_num_str}")
            print(f"Progress bar: {'Yes' if page_obj.progress_bar else 'No'}")
            print(f"Steps: {len(page_obj.steps)}")

            # `i` is the 1-based position on the page; the value after "#" is
            # the step's own number.
            for i, step in enumerate(page_obj.steps, 1):
                parts_count = len(step.parts_list.parts)
                print(f"  Step {i}: #{step.step_number.value} ({parts_count} parts)")

    print(f"\n{'=' * 80}\n")


def print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks on a page carry each label.

    Blocks for which ``result.get_label`` returns nothing are tallied under
    ``"<unknown>"``. The tally is emitted through the module logger at INFO
    level rather than printed to stdout.

    Args:
        page: PageData containing all elements
        result: ClassificationResult with labels
    """
    label_counts: defaultdict[str, int] = defaultdict(int)
    for block in page.blocks:
        label_counts[result.get_label(block) or "<unknown>"] += 1

    # Log a plain dict: a defaultdict's repr starts with
    # "defaultdict(<class 'int'>, ...", which is noise in the log line.
    # Passing the values as logger arguments also defers the string
    # formatting until the record is actually emitted.
    logger.info(
        "Page %s Label counts: %s", page.page_number, dict(label_counts)
    )


def print_page_hierarchy(page_data: PageData, page: Page) -> None:
    """Print an indented outline of one structured page to stdout.

    The outline is headed by the raw page number from ``page_data``. Below it
    come, each only when present: the detected page number, the steps (with
    their parts list and diagram bounding box), the page's warnings, and the
    number of unprocessed elements.

    Args:
        page_data: PageData containing the raw page number
        page: Structured Page object with steps, parts lists, etc.
    """
    print(f"Page {page_data.page_number}:")

    detected_number = page.page_number
    if detected_number:
        print(f"  ✓ Page Number: {detected_number.value}")

    steps = page.steps
    if steps:
        print(f"  ✓ Steps: {len(steps)}")
        for step in steps:
            parts = step.parts_list.parts
            parts_count = len(parts)
            print(f"    - Step {step.step_number.value} ({parts_count} parts)")

            if not parts:
                print("      Parts List: (empty)")
            else:
                print("      Parts List:")
                for part in parts:
                    # A part may have no number; show a placeholder then.
                    if part.number:
                        number_str = part.number.element_id
                    else:
                        number_str = "no number"
                    print(f"        • {part.count.count}x ({number_str})")

            print(f"      Diagram: {step.diagram.bbox}")

    page_warnings = page.warnings
    if page_warnings:
        print(f"  ⚠ Warnings: {len(page_warnings)}")
        for warning in page_warnings:
            print(f"    - {warning}")

    leftover = page.unprocessed_elements
    if leftover:
        print(f"  ℹ Unprocessed elements: {len(leftover)}")


def build_and_print_page_hierarchy(
    pages: list[PageData], results: list[ClassificationResult]
) -> None:
    """Print the structured hierarchy of every page that has one.

    Pages and results are paired in order with ``zip(..., strict=True)``, so
    lists of different lengths raise ``ValueError``. A pair is printed only
    when the result's ``page`` attribute is truthy; other pairs are skipped
    silently.

    Args:
        pages: List of PageData containing extracted elements
        results: List of ClassificationResult with labels and relationships
    """
    print("Building LEGO page hierarchy...")

    for raw_page, classification in zip(pages, results, strict=True):
        structured = classification.page
        if not structured:
            continue
        print_page_hierarchy(raw_page, structured)

1	"""Reporting and output formatting for PDF extraction."""
2
3	import logging	1✔
4	from collections import defaultdict	1✔
5	from typing import Any	1✔
6
7	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
8	ClassificationResult,
9	)
10	from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints	1✔
11	from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram	1✔
12	from build_a_long.pdf_extract.extractor import PageData	1✔
13	from build_a_long.pdf_extract.extractor.hierarchy import build_hierarchy_from_blocks	1✔
14	from build_a_long.pdf_extract.extractor.lego_page_elements import Page	1✔
15	from build_a_long.pdf_extract.extractor.page_blocks import Block	1✔
16
17	logger = logging.getLogger(__name__)	1✔
18
19	# ANSI color codes
20	GREY = "\033[90m"	1✔
21	RESET = "\033[0m"	1✔
22
23
24	def print_summary(	1✔
25	pages: list[PageData],
26	results: list[ClassificationResult],
27	*,
28	detailed: bool = False,
29	) -> None:
30	"""Print a human-readable summary of classification results to stdout.
31
32	Args:
33	pages: List of PageData containing extracted elements
34	results: List of ClassificationResult with labels
35	detailed: If True, include additional details like missing page numbers
36	"""
37	total_pages = len(pages)	1✔
38	total_blocks = 0	1✔
39	blocks_by_type: dict[str, int] = {}	1✔
40	labeled_counts: dict[str, int] = {}	1✔
41
42	pages_with_page_number = 0	1✔
43	missing_page_numbers: list[int] = []	1✔
44
45	for page, result in zip(pages, results, strict=True):	1✔
46	total_blocks += len(page.blocks)	1✔
47	# Tally block types and labels
48	has_page_number = False	1✔
49	for block in page.blocks:	1✔
50	t = block.__class__.__name__.lower()	1✔
51	blocks_by_type[t] = blocks_by_type.get(t, 0) + 1	1✔
52
53	label = result.get_label(block)	1✔
54	if label:	1✔
55	labeled_counts[label] = labeled_counts.get(label, 0) + 1	1✔
56	if label == "page_number":	1✔
UNCOV 57	has_page_number = True	×
58
59	if has_page_number:	1✔
UNCOV 60	pages_with_page_number += 1	×
61	else:
62	missing_page_numbers.append(page.page_number)	1✔
63
64	coverage = (pages_with_page_number / total_pages * 100.0) if total_pages else 0.0	1✔
65
66	# Human-friendly, single-shot summary
67	print("=== Classification summary ===")	1✔
68	print(f"Pages processed: {total_pages}")	1✔
69	print(f"Total blocks: {total_blocks}")	1✔
70	if blocks_by_type:	1✔
71	parts = [f"{k}={v}" for k, v in sorted(blocks_by_type.items())]	1✔
72	print("Elements by type: " + ", ".join(parts))	1✔
73	if labeled_counts:	1✔
74	parts = [f"{k}={v}" for k, v in sorted(labeled_counts.items())]	1✔
75	print("Labeled elements: " + ", ".join(parts))	1✔
76	print(	1✔
77	f"Page-number coverage: {pages_with_page_number}/{total_pages} "
78	f"({coverage:.1f}%)"
79	)
80	if detailed and missing_page_numbers:	1✔
81	sample = ", ".join(str(n) for n in missing_page_numbers[:20])	×
82	more = " ..." if len(missing_page_numbers) > 20 else ""	×
UNCOV 83	print(f"Pages missing page number: {sample}{more}")	×
84
85
86	def _print_font_size_distribution(	1✔
87	title: str,
88	counter: Any,
89	*,
90	max_items: int = 10,
91	empty_message: str = "(no data)",
92	total_label: str = "Total text elements",
93	unique_label: str = "Total unique sizes",
94	) -> None:
95	"""Print a font size distribution with bar chart.
96
97	Args:
98	title: Section title to display
99	counter: Counter/dict mapping font sizes to counts
100	max_items: Maximum number of items to display
101	empty_message: Message to show when counter is empty
102	total_label: Label for total count summary
103	unique_label: Label for unique size count
104	"""
105	print(title)	×
UNCOV 106	print("-" * 60)	×
107
UNCOV 108	total = sum(counter.values())	×
109
110	if total > 0:	×
111	print(f"{'Size':>8} \| {'Count':>6} \| Distribution")	×
UNCOV 112	print("-" * 60)	×
113
114	# Get most common items
115	if hasattr(counter, "most_common"):	×
UNCOV 116	items = counter.most_common(max_items)	×
117	else:
UNCOV 118	items = sorted(counter.items(), key=lambda x: x[1], reverse=True)[	×
119	:max_items
120	]
121
122	max_count = items[0][1] if items else 1	×
123	for size, count in items:	×
124	bar_length = int((count / max_count) * 30)	×
125	bar = "█" * bar_length	×
UNCOV 126	print(f"{size:8.1f} \| {count:6d} \| {bar}")	×
127
128	print("-" * 60)	×
129	print(f"{unique_label}: {len(counter)}")	×
UNCOV 130	print(f"{total_label}: {total}")	×
131	else:
132	print(empty_message)	×
UNCOV 133	print()	×
134
135
136	def print_histogram(histogram: TextHistogram) -> None:	1✔
137	"""Print the text histogram showing font size and name distributions.
138
139	Args:
140	histogram: TextHistogram containing font statistics across all pages
141	"""
142	print("=== Text Histogram ===")	×
UNCOV 143	print()	×
144
145	# 1. Part counts (\dx pattern) - calculated first
UNCOV 146	_print_font_size_distribution(	×
147	"1. Part Count Font Sizes (\\dx pattern, e.g., '2x', '3x'):",
148	histogram.part_count_font_sizes,
149	empty_message="(no part count data)",
150	total_label="Total part counts",
151	)
152
153	# 2. Page numbers (±1) - calculated second
UNCOV 154	_print_font_size_distribution(	×
155	"2. Page Number Font Sizes (digits ±1 from current page):",
156	histogram.page_number_font_sizes,
157	empty_message="(no page number data)",
158	total_label="Total page numbers",
159	)
160
161	# 3. Element IDs (6-7 digit numbers) - calculated third
UNCOV 162	_print_font_size_distribution(	×
163	"3. Element ID Font Sizes (6-7 digit numbers):",
164	histogram.element_id_font_sizes,
165	empty_message="(no Element ID data)",
166	total_label="Total Element IDs",
167	)
168
169	# 4. Other integer font sizes - calculated fourth
UNCOV 170	_print_font_size_distribution(	×
171	"4. Other Integer Font Sizes (integers not matching above patterns):",
172	histogram.remaining_font_sizes,
173	max_items=20,
174	empty_message="(no other integer font size data)",
175	)
176
177	# 5. Font name distribution - calculated fifth
178	print("5. Font Name Distribution:")	×
UNCOV 179	print("-" * 60)	×
180
UNCOV 181	font_name_total = sum(histogram.font_name_counts.values())	×
182
183	if font_name_total > 0:	×
184	print(f"{'Font Name':<30} \| {'Count':>6} \| Distribution")	×
UNCOV 185	print("-" * 60)	×
186
187	font_names = histogram.font_name_counts.most_common(20)	×
188	max_count = font_names[0][1] if font_names else 1	×
189	for font_name, count in font_names:	×
190	bar_length = int((count / max_count) * 30)	×
191	bar = "█" * bar_length	×
192	name_display = font_name[:27] + "..." if len(font_name) > 30 else font_name	×
UNCOV 193	print(f"{name_display:<30} \| {count:6d} \| {bar}")	×
194
195	print("-" * 60)	×
196	print(f"Total unique fonts: {len(histogram.font_name_counts)}")	×
UNCOV 197	print(f"Total text elements: {font_name_total}")	×
198	else:
UNCOV 199	print("(no font name data)")	×
200
UNCOV 201	print()	×
202
203
204	def print_font_hints(hints: FontSizeHints) -> None:	1✔
205	"""Print font size hints extracted from the document.
206
207	Args:
208	hints: FontSizeHints containing identified font sizes for different elements
209	"""
210	print("=== Font Size Hints ===")	×
UNCOV 211	print()	×
212
UNCOV 213	def format_size(size: float \| None) -> str:	×
214	"""Format a font size for display."""
UNCOV 215	return f"{size:.1f}pt" if size is not None else "N/A"	×
216
217	print("Identified font sizes:")	×
218	print(f" Part count size: {format_size(hints.part_count_size)}")	×
219	print(f" Catalog part count size: {format_size(hints.catalog_part_count_size)}")	×
220	print(f" Step number size: {format_size(hints.step_number_size)}")	×
221	print(f" Step repeat size: {format_size(hints.step_repeat_size)}")	×
222	print(f" Catalog element ID size: {format_size(hints.catalog_element_id_size)}")	×
UNCOV 223	print(f" Page number size: {format_size(hints.page_number_size)}")	×
224
225	print()	×
226	print("Remaining font sizes after removing known patterns:")	×
227	if hints.remaining_font_sizes:	×
228	print(f"{'Size':>8} \| {'Count':>6}")	×
229	print("-" * 20)	×
230	for size, count in hints.remaining_font_sizes:	×
231	print(f"{size:8.1f} \| {count:6d}")	×
UNCOV 232	print(f"\nTotal unique sizes: {len(hints.remaining_font_sizes)}")	×
233	else:
234	print(" (no remaining font sizes)")	×
UNCOV 235	print()	×
236
237
238	def print_classification_debug(	1✔
239	page: PageData,
240	result: ClassificationResult,
241	*,
242	show_candidates: bool = True,
243	show_hierarchy: bool = True,
244	label: str \| None = None,
245	) -> None:
246	"""Print comprehensive classification debug information.
247
248	Shows all classification details in one consolidated view:
249	- Block hierarchy with labels and removal status
250	- Detailed candidate analysis (if requested)
251	- Page hierarchy summary (if requested)
252
253	Args:
254	page: PageData containing all elements
255	result: ClassificationResult with classification information
256	show_candidates: Include detailed candidate breakdown
257	show_hierarchy: Include page hierarchy summary
258	label: If provided, filter candidate analysis to this label only
259	"""
260	print(f"\n{'=' * 80}")	×
UNCOV 261	print(f"CLASSIFICATION DEBUG - Page {page.page_number}")	×
262	print(f"{'=' * 80}\n")	×
263
264	# Build block hierarchy tree
265	block_tree = build_hierarchy_from_blocks(page.blocks)	×
266
267	def print_block(block: Block, depth: int, is_last: bool = True) -> None:	×
268	"""Recursively print a block and its children."""
269	# Build tree characters
UNCOV 270	if depth == 0:	×
271	tree_prefix = ""	×
UNCOV 272	indent = ""	×
273	else:
274	tree_char = "└─" if is_last else "├─"	×
275	indent = " " * (depth - 1)	×
276	tree_prefix = f"{indent}{tree_char} "	×
277
278	# Base info
279	is_removed = result.is_removed(block)	×
UNCOV 280	color = GREY if is_removed else ""	×
281	reset = RESET if is_removed else ""	×
282
283	# Build line - get constructed element from winner candidate
UNCOV 284	elem_str = str(block)	×
285	label = result.get_label(block)	×
286	if label:	×
UNCOV 287	winner = result.get_winner_candidate(block)	×
UNCOV 288	if winner and winner.constructed:	×
289	elem_str = str(winner.constructed)	×
290
291	line = f"{color}{tree_prefix}{block.id:3d} "	×
292
UNCOV 293	if is_removed:	×
UNCOV 294	reason = result.get_removal_reason(block)	×
295	reason_text = reason.reason_type if reason else "unknown"	×
296	line += f"* REMOVED: {reason_text}"	×
297	if reason:	×
UNCOV 298	target = reason.target_block	×
299	line += f" by {target.id}"	×
300	target_label = result.get_label(target)	×
301	if target_label:	×
302	line += f" ({target_label})"	×
303	line += f"* {elem_str}"	×
304	elif label:	×
305	line += f"[{label}] {elem_str}"	×
306	else:
UNCOV 307	line += f"[no candidates] {elem_str}"	×
308
UNCOV 309	line += reset	×
UNCOV 310	print(line)	×
311
312	# Print children
313	children = block_tree.get_children(block)	×
314	sorted_children = sorted(children, key=lambda e: e.id)	×
315	for i, child in enumerate(sorted_children):	×
UNCOV 316	child_is_last = i == len(sorted_children) - 1	×
317	print_block(child, depth + 1, child_is_last)	×
318
319	# Print root blocks
UNCOV 320	sorted_roots = sorted(block_tree.roots, key=lambda e: e.id)	×
UNCOV 321	for root in sorted_roots:	×
322	print_block(root, 0)	×
323
324	# Summary stats
325	total = len(page.blocks)	×
UNCOV 326	with_labels = sum(1 for b in page.blocks if result.get_label(b) is not None)	×
UNCOV 327	removed = sum(1 for b in page.blocks if result.is_removed(b))	×
328	no_candidates = total - with_labels - removed	×
329
UNCOV 330	print(f"\n{'─' * 80}")	×
UNCOV 331	print(	×
332	f"Total: {total} \| Winners: {with_labels} \| "
333	f"Removed: {removed} \| No candidates: {no_candidates}"
334	)
335
UNCOV 336	warnings = result.get_warnings()	×
337	if warnings:	×
UNCOV 338	print(f"Warnings: {len(warnings)}")	×
UNCOV 339	for warning in warnings:	×
340	print(f" ⚠ {warning}")	×
341
342	# Detailed candidate analysis
UNCOV 343	if show_candidates:	×
344	print(f"\n{'=' * 80}")	×
345	print("CANDIDATES BY LABEL")	×
346	print(f"{'=' * 80}")	×
347
348	# Get all candidates
349	all_candidates = result.get_all_candidates()	×
350
351	# Filter to specific label if requested
UNCOV 352	if label:	×
UNCOV 353	labels_to_show = {label: all_candidates.get(label, [])}	×
354	else:
355	labels_to_show = all_candidates	×
356
357	# Summary table
358	print(f"\n{'Label':<20} {'Total':<8} {'Winners':<8}")	×
359	print(f"{'-' * 40}")	×
UNCOV 360	for lbl in sorted(labels_to_show.keys()):	×
361	candidates = labels_to_show[lbl]	×
UNCOV 362	winners = [c for c in candidates if c.is_winner]	×
UNCOV 363	print(f"{lbl:<20} {len(candidates):<8} {len(winners):<8}")	×
364
365	# Detailed per-label breakdown
UNCOV 366	for lbl in sorted(labels_to_show.keys()):	×
UNCOV 367	candidates = labels_to_show[lbl]	×
UNCOV 368	if not candidates:	×
UNCOV 369	continue	×
370
371	winners = [c for c in candidates if c.is_winner]	×
372	if not winners:	×
373	continue # Skip labels with no winners for brevity	×
374
UNCOV 375	print(f"\n{lbl} ({len(winners)} winner{'s' if len(winners) > 1 else ''}):")	×
UNCOV 376	for candidate in winners:	×
UNCOV 377	block = candidate.source_block	×
378	# Format similar to tree: block_id [label] constructed \| source
UNCOV 379	block_id_str = f"{block.id:3d}" if block else " ?"	×
UNCOV 380	constructed_str = str(candidate.constructed)	×
UNCOV 381	source_str = str(block) if block else "no source"	×
UNCOV 382	print(	×
383	f" {block_id_str} [{lbl}] {constructed_str} \| "
384	f"score={candidate.score:.3f} \| {source_str}"
385	)
386
387	# Page hierarchy
UNCOV 388	if show_hierarchy:	×
UNCOV 389	page_obj = result.page	×
UNCOV 390	if page_obj:	×
UNCOV 391	print(f"\n{'=' * 80}")	×
UNCOV 392	print("PAGE HIERARCHY")	×
UNCOV 393	print(f"{'=' * 80}")	×
UNCOV 394	page_num_str = (	×
395	page_obj.page_number.value if page_obj.page_number else "None"
396	)
UNCOV 397	print(f"Page number: {page_num_str}")	×
UNCOV 398	print(f"Progress bar: {'Yes' if page_obj.progress_bar else 'No'}")	×
UNCOV 399	print(f"Steps: {len(page_obj.steps)}")	×
400
UNCOV 401	for i, step in enumerate(page_obj.steps, 1):	×
UNCOV 402	parts_count = len(step.parts_list.parts)	×
UNCOV 403	print(f" Step {i}: #{step.step_number.value} ({parts_count} parts)")	×
404
UNCOV 405	print(f"\n{'=' * 80}\n")	×
406
407
408	def print_label_counts(page: PageData, result: ClassificationResult) -> None:	1✔
409	"""Print label count statistics for a page.
410
411	Args:
412	page: PageData containing all elements
413	result: ClassificationResult with labels
414	"""
415	label_counts = defaultdict(int)	×
UNCOV 416	for e in page.blocks:	×
UNCOV 417	label = result.get_label(e) or "<unknown>"	×
UNCOV 418	label_counts[label] += 1	×
419
420	# TODO The following logging shows "defaultdict(<class 'int'>,..." figure
421	# out how to avoid that.
UNCOV 422	logger.info(f"Page {page.page_number} Label counts: {label_counts}")	×
423
424
425	def print_page_hierarchy(page_data: PageData, page: Page) -> None:	1✔
426	"""Print the structured LEGO page hierarchy.
427
428	Args:
429	page_data: PageData containing the raw page number
430	page: Structured Page object with steps, parts lists, etc.
431	"""
432	print(f"Page {page_data.page_number}:")	1✔
433
434	if page.page_number:	1✔
435	print(f" ✓ Page Number: {page.page_number.value}")	1✔
436
437	if page.steps:	1✔
438	print(f" ✓ Steps: {len(page.steps)}")	1✔
439	for step in page.steps:	1✔
440	parts_count = len(step.parts_list.parts)	1✔
441	print(f" - Step {step.step_number.value} ({parts_count} parts)")	1✔
442	# Print parts list details
443	if step.parts_list.parts:	1✔
444	print(" Parts List:")	1✔
445	for part in step.parts_list.parts:	1✔
446	number_str = part.number.element_id if part.number else "no number"	1✔
447	print(f" • {part.count.count}x ({number_str})")	1✔
448	else:
449	print(" Parts List: (empty)")	1✔
450
451	print(f" Diagram: {step.diagram.bbox}")	1✔
452
453	if page.warnings:	1✔
UNCOV 454	print(f" ⚠ Warnings: {len(page.warnings)}")	×
UNCOV 455	for warning in page.warnings:	×
UNCOV 456	print(f" - {warning}")	×
457
458	if page.unprocessed_elements:	1✔
UNCOV 459	print(f" ℹ Unprocessed elements: {len(page.unprocessed_elements)}")	×
460
461
462	def build_and_print_page_hierarchy(	1✔
463	pages: list[PageData], results: list[ClassificationResult]
464	) -> None:
465	"""Build LEGO page hierarchy from classification results and print structure.
466
467	Args:
468	pages: List of PageData containing extracted elements
469	results: List of ClassificationResult with labels and relationships
470	"""
UNCOV 471	print("Building LEGO page hierarchy...")	×
472
UNCOV 473	for page_data, result in zip(pages, results, strict=True):	×
UNCOV 474	page = result.page	×
UNCOV 475	if page:	×
UNCOV 476	print_page_hierarchy(page_data, page)	×

bramp / build-along / 19521244091

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous