20400711546

Committed 20 Dec 2025 10:09PM UTC coverage: 89.367% (+0.006%) from 89.361%

Build # 20400711546

Build Type

push

github

Committed by

bramp

Commit Message

docs: Add comprehensive Classifier best practices documentation

- Add detailed docstrings to Classifier and RuleBasedClassifier classes
  covering all aspects of writing robust classifiers
- Document scoring phase: API access rules, Score object design,
  intrinsic vs relationship-based scoring
- Document build phase: source block rules, exception handling,
  construction patterns
- Document build_all(): when to use for global coordination
- Add complete code examples for atomic and composite patterns
- Fix DESIGN.md contradiction about Score objects storing candidates
- Update README.md and DESIGN.md to reference class docstrings as
  single source of truth
- Add recommendations to use RuleBasedClassifier for atomic classifiers

This consolidates documentation to reduce duplication and provides
clear guidelines for both humans and AI agents writing new classifiers.

Run Details

13708 of 15339 relevant lines covered (89.37%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

26.09

/src/build_a_long/pdf_extract/cli/unconsumed_diagnostics.py

"""Diagnostic utilities for analyzing unconsumed blocks.

This module provides tools to categorize and explain why blocks weren't consumed
by any LEGO page element. It helps identify patterns in unconsumed blocks and
provides actionable recommendations.
"""

from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass
from enum import Enum, auto

from build_a_long.pdf_extract.classifier.classification_result import (
    ClassificationResult,
)
from build_a_long.pdf_extract.extractor.extractor import PageData
from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing, Image, Text


class UnconsumedCategory(Enum):
    """Buckets used to explain why a block was left unconsumed.

    Values are assigned by ``auto()`` in declaration order, so the order of
    the members below determines their numeric values.
    """

    # Drawing whose bounding box has zero width (x0 == x1).
    ZERO_WIDTH = auto()

    # Drawing whose bounding box has zero height (y0 == y1).
    ZERO_HEIGHT = auto()

    # Zero-width or zero-height drawing lying on a page boundary
    # (x at 0 / page width, or y at 0 / page height).
    PAGE_EDGE_LINE = auto()

    # Text made up of whitespace only.
    WHITESPACE_TEXT = auto()

    # Copyright or trademark text (typically found on info pages).
    COPYRIGHT_TEXT = auto()

    # Very small drawing, likely a dot or an artifact.
    SMALL_DOT = auto()

    # Large drawing that no classifier matched.
    LARGE_UNCLASSIFIED = auto()

    # Image, e.g. on a catalog/info page, that was not assigned to an element.
    IMAGE_IN_COMPLEX_PAGE = auto()

    # Fallback for blocks that fit none of the categories above.
    UNKNOWN = auto()


@dataclass
class UnconsumedBlockInfo:
    """Diagnosis attached to a single unconsumed block.

    Bundles the block with the category it was sorted into, a
    human-readable explanation, and a suggested follow-up action.
    """

    # The block that no page element consumed.
    block: Blocks
    # Bucket the block was sorted into.
    category: UnconsumedCategory
    # Human-readable explanation of why the block landed in that bucket.
    reason: str
    # Suggested next step for dealing with blocks of this kind.
    recommendation: str


# Substrings that mark legal boilerplate. They are matched against the
# lower-cased block text, so every entry must itself be lower case.
_COPYRIGHT_KEYWORDS = (
    "©",
    "™",
    "®",
    "copyright",
    "trademark",
    "lego.com",
    "lucasfilm",
    "disney",
    "marcas registradas",
)

# Maximum number of characters of block text echoed in a reason string.
_TEXT_PREVIEW_CHARS = 40

# Drawings with an area below this many square points count as dots
# (roughly 5x5 or less).
_SMALL_DOT_MAX_AREA = 25

# Drawings covering more than this fraction of the page count as large.
_LARGE_DRAWING_PAGE_FRACTION = 0.05


def _categorize_drawing(
    block: Drawing,
    page_data: PageData,
) -> UnconsumedBlockInfo | None:
    """Apply the drawing-specific rules; return None when none of them match."""
    bbox = block.bbox
    page_width = page_data.bbox.width
    page_height = page_data.bbox.height

    # Zero width is tested before zero height, so a drawing that is degenerate
    # in both directions is reported as zero-width.
    if bbox.width == 0:
        # NOTE(review): exact float comparison, and it presumes the page's
        # left edge is at x=0 - confirm against how page bboxes are built.
        if bbox.x0 == 0 or bbox.x0 == page_width:
            return UnconsumedBlockInfo(
                block=block,
                category=UnconsumedCategory.PAGE_EDGE_LINE,
                reason=f"Zero-width line at page edge (x={bbox.x0})",
                recommendation="Filter page-edge lines in block_filter.py",
            )
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.ZERO_WIDTH,
            reason=f"Zero-width drawing at x={bbox.x0}",
            recommendation="Filter zero-width drawings in block_filter.py",
        )

    if bbox.height == 0:
        # NOTE(review): same caveats as the x-axis edge test above.
        if bbox.y0 == 0 or bbox.y0 == page_height:
            return UnconsumedBlockInfo(
                block=block,
                category=UnconsumedCategory.PAGE_EDGE_LINE,
                reason=f"Zero-height line at page edge (y={bbox.y0})",
                recommendation="Filter page-edge lines in block_filter.py",
            )
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.ZERO_HEIGHT,
            reason=f"Zero-height drawing at y={bbox.y0}",
            recommendation="Filter zero-height drawings in block_filter.py",
        )

    if bbox.area < _SMALL_DOT_MAX_AREA:
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.SMALL_DOT,
            reason=f"Very small drawing (area={bbox.area:.1f} sq pts)",
            recommendation="Consider if this is a significant element or artifact",
        )

    # A page without a positive area cannot yield a meaningful percentage;
    # skipping the rule also avoids dividing by zero in the reason string.
    page_area = page_width * page_height
    if page_area > 0 and bbox.area > page_area * _LARGE_DRAWING_PAGE_FRACTION:
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.LARGE_UNCLASSIFIED,
            reason=f"Large drawing ({bbox.area / page_area * 100:.1f}% of page)",
            recommendation=(
                "Review if this should be a background, diagram, or other element"
            ),
        )

    return None


def _categorize_text(block: Text) -> UnconsumedBlockInfo | None:
    """Apply the text-specific rules; return None when none of them match."""
    text = block.text

    if not text.strip():
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.WHITESPACE_TEXT,
            reason="Text contains only whitespace",
            recommendation="Filter whitespace-only text in block_filter.py",
        )

    lowered = text.lower()
    if any(keyword in lowered for keyword in _COPYRIGHT_KEYWORDS):
        # Only mark the preview as truncated when text was actually cut off.
        preview = text[:_TEXT_PREVIEW_CHARS]
        ellipsis = "..." if len(text) > _TEXT_PREVIEW_CHARS else ""
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.COPYRIGHT_TEXT,
            reason=f"Copyright/trademark text: '{preview}{ellipsis}'",
            recommendation="Add classifier for legal/copyright text",
        )

    return None


def categorize_unconsumed_block(
    block: Blocks,
    page_data: PageData,
) -> UnconsumedBlockInfo:
    """Categorize an unconsumed block and provide actionable information.

    Rules are tried by block type, first match wins:

    * Drawing: zero width, zero height (either one becomes PAGE_EDGE_LINE
      when it lies on a page edge), small dot, large unclassified drawing.
    * Text: whitespace-only text, then copyright/trademark text.
    * Image: always IMAGE_IN_COMPLEX_PAGE.

    Anything left over, including drawings and text that match no rule, is
    reported as UNKNOWN.

    Args:
        block: The unconsumed block
        page_data: Page data for context (page dimensions, etc.)

    Returns:
        UnconsumedBlockInfo with category, reason, and recommendation
    """
    if isinstance(block, Drawing):
        info = _categorize_drawing(block, page_data)
        if info is not None:
            return info

    if isinstance(block, Text):
        info = _categorize_text(block)
        if info is not None:
            return info

    if isinstance(block, Image):
        return UnconsumedBlockInfo(
            block=block,
            category=UnconsumedCategory.IMAGE_IN_COMPLEX_PAGE,
            reason="Image not consumed by any element",
            recommendation=(
                "Review if this should be a diagram, part_image, or other element"
            ),
        )

    return UnconsumedBlockInfo(
        block=block,
        category=UnconsumedCategory.UNKNOWN,
        reason="Does not match known unconsumed patterns",
        recommendation="Manual review required",
    )


def get_unconsumed_blocks(result: ClassificationResult) -> list[Blocks]:
    """Collect the blocks of a page that nothing accounted for.

    A block is unconsumed when ``result.get_best_candidate`` returns nothing
    truthy for it and ``result.is_removed`` is false for it.

    Args:
        result: Classification result whose page blocks are inspected

    Returns:
        The unconsumed blocks, in the order they appear in
        ``result.page_data.blocks``
    """
    # ``or`` short-circuits, so the removal check only runs for blocks that
    # have no best candidate.
    return [
        block
        for block in result.page_data.blocks
        if not (result.get_best_candidate(block) or result.is_removed(block))
    ]


def analyze_unconsumed_blocks(
    result: ClassificationResult,
) -> dict[UnconsumedCategory, list[UnconsumedBlockInfo]]:
    """Bucket the unconsumed blocks of one page by category.

    Args:
        result: Classification result to analyze

    Returns:
        Plain dict from category to the UnconsumedBlockInfo entries in that
        category. Only categories with at least one block appear as keys, so
        the dict is empty when the page has no unconsumed blocks.
    """
    page_data = result.page_data
    by_category: dict[UnconsumedCategory, list[UnconsumedBlockInfo]] = {}

    for block in get_unconsumed_blocks(result):
        info = categorize_unconsumed_block(block, page_data)
        by_category.setdefault(info.category, []).append(info)

    return by_category


def print_unconsumed_diagnostics(
    results: list[ClassificationResult],
    *,
    show_details: bool = True,
) -> None:
    """Write an unconsumed-block report covering all pages to stdout.

    Results with a truthy ``skipped_reason`` are ignored. The report holds
    overall totals, a per-category breakdown, the recommendations ranked by
    how many blocks they apply to and, optionally, a per-page listing.

    Args:
        results: List of classification results
        show_details: If True, also list individual blocks per page (at most
            five per category per page)
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 60
    max_listed = 5  # Cap on blocks listed per category per page

    # Keep only the pages that were processed and have something to report.
    per_page: list[tuple[int, dict]] = []
    for result in results:
        if result.skipped_reason:
            continue
        analysis = analyze_unconsumed_blocks(result)
        if analysis:
            per_page.append((result.page_data.page_number, analysis))

    # Tally blocks per category and per recommendation in a single pass.
    category_counts: dict[UnconsumedCategory, int] = {}
    recommendation_counts: dict[str, int] = {}
    for _page_num, analysis in per_page:
        for category, infos in analysis.items():
            category_counts[category] = category_counts.get(category, 0) + len(infos)
            for info in infos:
                recommendation_counts[info.recommendation] = (
                    recommendation_counts.get(info.recommendation, 0) + 1
                )

    total_unconsumed = sum(category_counts.values())
    if total_unconsumed == 0:
        print("\n✓ All blocks are consumed!")
        return

    print(f"\n{heavy_rule}")
    print("UNCONSUMED BLOCK DIAGNOSTICS")
    print(heavy_rule)
    print(f"\nTotal unconsumed blocks: {total_unconsumed}")
    print(f"Pages with unconsumed blocks: {len(per_page)}")

    # Category summary, most frequent first (ties keep first-seen order).
    print("\nBy Category:")
    print(light_rule)
    for category, count in sorted(category_counts.items(), key=lambda item: -item[1]):
        share = count / total_unconsumed * 100
        print(f"  {category.name:25} {count:5d} ({share:5.1f}%)")

    # Recommendations, most frequent first (ties keep first-seen order).
    print("\nRecommendations:")
    print(light_rule)
    ranked_recommendations = sorted(
        recommendation_counts.items(), key=lambda item: item[1], reverse=True
    )
    for rec, count in ranked_recommendations:
        print(f"  • {rec} ({count} blocks)")

    if not show_details:
        return

    print(f"\n{heavy_rule}")
    print("PER-PAGE DETAILS")
    print(heavy_rule)

    for page_num, analysis in per_page:
        page_total = sum(map(len, analysis.values()))
        print(f"\nPage {page_num}: {page_total} unconsumed blocks")

        # Categories are listed alphabetically by name within a page.
        for category, infos in sorted(analysis.items(), key=lambda item: item[0].name):
            if not infos:
                continue
            print(f"  {category.name}:")
            for info in infos[:max_listed]:
                block = info.block
                box = block.bbox
                print(
                    f"    #{block.id} {type(block).__name__} "
                    f"({box.x0:.1f},{box.y0:.1f},{box.x1:.1f},{box.y1:.1f})"
                )
                print(f"       Reason: {info.reason}")

            hidden = len(infos) - max_listed
            if hidden > 0:
                print(f"    ... and {hidden} more")

1	"""Diagnostic utilities for analyzing unconsumed blocks.
2
3	This module provides tools to categorize and explain why blocks weren't consumed
4	by any LEGO page element. It helps identify patterns in unconsumed blocks and
5	provides actionable recommendations.
6	"""
7
8	from __future__ import annotations	1✔
9
10	from collections import defaultdict	1✔
11	from dataclasses import dataclass	1✔
12	from enum import Enum, auto	1✔
13
14	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
15	ClassificationResult,
16	)
17	from build_a_long.pdf_extract.extractor.extractor import PageData	1✔
18	from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing, Image, Text	1✔
19
20
21	class UnconsumedCategory(Enum):	1✔
22	"""Categories of unconsumed blocks."""
23
24	ZERO_WIDTH = auto()	1✔
25	"""Drawing with zero width (x0 == x1)"""	1✔
26
27	ZERO_HEIGHT = auto()	1✔
28	"""Drawing with zero height (y0 == y1)"""	1✔
29
30	PAGE_EDGE_LINE = auto()	1✔
31	"""Line at page boundary (x=0 or x=page_width)"""	1✔
32
33	WHITESPACE_TEXT = auto()	1✔
34	"""Text containing only whitespace"""	1✔
35
36	COPYRIGHT_TEXT = auto()	1✔
37	"""Copyright or trademark text on info pages"""	1✔
38
39	SMALL_DOT = auto()	1✔
40	"""Very small drawing (likely a dot or artifact)"""	1✔
41
42	LARGE_UNCLASSIFIED = auto()	1✔
43	"""Large drawing that didn't match any classifier"""	1✔
44
45	IMAGE_IN_COMPLEX_PAGE = auto()	1✔
46	"""Image on catalog/info page that wasn't assigned"""	1✔
47
48	UNKNOWN = auto()	1✔
49	"""Block that doesn't fit known categories"""	1✔
50
51
52	@dataclass	1✔
53	class UnconsumedBlockInfo:	1✔
54	"""Information about an unconsumed block."""
55
56	block: Blocks	1✔
57	category: UnconsumedCategory	1✔
58	reason: str	1✔
59	recommendation: str	1✔
60
61
62	def categorize_unconsumed_block(	1✔
63	block: Blocks,
64	page_data: PageData,
65	) -> UnconsumedBlockInfo:
66	"""Categorize an unconsumed block and provide actionable information.
67
68	Args:
69	block: The unconsumed block
70	page_data: Page data for context (page dimensions, etc.)
71
72	Returns:
73	UnconsumedBlockInfo with category, reason, and recommendation
74	"""
75	page_width = page_data.bbox.width	×
76	page_height = page_data.bbox.height	×
77
78	# Check for zero-dimension drawings
79	if isinstance(block, Drawing):	×
80	bbox = block.bbox	×
81	width = bbox.width	×
82	height = bbox.height	×
83
84	# Zero width
85	if width == 0:	×
86	is_at_edge = bbox.x0 == 0 or bbox.x0 == page_width	×
87	if is_at_edge:	×
88	return UnconsumedBlockInfo(	×
89	block=block,
90	category=UnconsumedCategory.PAGE_EDGE_LINE,
91	reason=f"Zero-width line at page edge (x={bbox.x0})",
92	recommendation="Filter page-edge lines in block_filter.py",
93	)
94	return UnconsumedBlockInfo(	×
95	block=block,
96	category=UnconsumedCategory.ZERO_WIDTH,
97	reason=f"Zero-width drawing at x={bbox.x0}",
98	recommendation="Filter zero-width drawings in block_filter.py",
99	)
100
101	# Zero height
102	if height == 0:	×
103	is_at_edge = bbox.y0 == 0 or bbox.y0 == page_height	×
104	if is_at_edge:	×
105	return UnconsumedBlockInfo(	×
106	block=block,
107	category=UnconsumedCategory.PAGE_EDGE_LINE,
108	reason=f"Zero-height line at page edge (y={bbox.y0})",
109	recommendation="Filter page-edge lines in block_filter.py",
110	)
111	return UnconsumedBlockInfo(	×
112	block=block,
113	category=UnconsumedCategory.ZERO_HEIGHT,
114	reason=f"Zero-height drawing at y={bbox.y0}",
115	recommendation="Filter zero-height drawings in block_filter.py",
116	)
117
118	# Small dot (area < 25 sq pts, roughly 5x5 or less)
119	if bbox.area < 25:	×
120	return UnconsumedBlockInfo(	×
121	block=block,
122	category=UnconsumedCategory.SMALL_DOT,
123	reason=f"Very small drawing (area={bbox.area:.1f} sq pts)",
124	recommendation="Consider if this is a significant element or artifact",
125	)
126
127	# Large unclassified drawing (> 5% of page area)
128	page_area = page_width * page_height	×
129	if bbox.area > page_area * 0.05:	×
130	return UnconsumedBlockInfo(	×
131	block=block,
132	category=UnconsumedCategory.LARGE_UNCLASSIFIED,
133	reason=f"Large drawing ({bbox.area / page_area * 100:.1f}% of page)",
134	recommendation=(
135	"Review if this should be a background, diagram, or other element"
136	),
137	)
138
139	# Check for whitespace-only text
140	if isinstance(block, Text):	×
141	if block.text.strip() == "":	×
142	return UnconsumedBlockInfo(	×
143	block=block,
144	category=UnconsumedCategory.WHITESPACE_TEXT,
145	reason="Text contains only whitespace",
146	recommendation="Filter whitespace-only text in block_filter.py",
147	)
148
149	# Check for copyright/trademark text
150	copyright_keywords = {	×
151	"©",
152	"™",
153	"®",
154	"copyright",
155	"trademark",
156	"lego.com",
157	"lucasfilm",
158	"disney",
159	"marcas registradas",
160	}
161	text_lower = block.text.lower()	×
162	if any(kw in text_lower for kw in copyright_keywords):	×
163	return UnconsumedBlockInfo(	×
164	block=block,
165	category=UnconsumedCategory.COPYRIGHT_TEXT,
166	reason=f"Copyright/trademark text: '{block.text[:40]}...'",
167	recommendation="Add classifier for legal/copyright text",
168	)
169
170	# Check for unclassified images
171	if isinstance(block, Image):	×
172	return UnconsumedBlockInfo(	×
173	block=block,
174	category=UnconsumedCategory.IMAGE_IN_COMPLEX_PAGE,
175	reason="Image not consumed by any element",
176	recommendation=(
177	"Review if this should be a diagram, part_image, or other element"
178	),
179	)
180
181	# Default: unknown category
182	return UnconsumedBlockInfo(	×
183	block=block,
184	category=UnconsumedCategory.UNKNOWN,
185	reason="Does not match known unconsumed patterns",
186	recommendation="Manual review required",
187	)
188
189
190	def get_unconsumed_blocks(result: ClassificationResult) -> list[Blocks]:	1✔
191	"""Get all unconsumed blocks from a classification result.
192
193	Args:
194	result: Classification result to check
195
196	Returns:
197	List of blocks that are unconsumed (no candidate and not removed)
198	"""
199	unconsumed = []	×
200	for block in result.page_data.blocks:	×
201	# Check if block is consumed by a constructed candidate
202	best_candidate = result.get_best_candidate(block)	×
203	if best_candidate:	×
204	continue	×
205
206	# Check if block was explicitly removed
207	if result.is_removed(block):	×
208	continue	×
209
210	# Block is unconsumed
211	unconsumed.append(block)	×
212
213	return unconsumed	×
214
215
216	def analyze_unconsumed_blocks(	1✔
217	result: ClassificationResult,
218	) -> dict[UnconsumedCategory, list[UnconsumedBlockInfo]]:
219	"""Analyze all unconsumed blocks and group by category.
220
221	Args:
222	result: Classification result to analyze
223
224	Returns:
225	Dictionary mapping categories to lists of UnconsumedBlockInfo
226	"""
227	categorized: dict[UnconsumedCategory, list[UnconsumedBlockInfo]] = defaultdict(list)	×
228
229	unconsumed = get_unconsumed_blocks(result)	×
230	for block in unconsumed:	×
231	info = categorize_unconsumed_block(block, result.page_data)	×
232	categorized[info.category].append(info)	×
233
234	return dict(categorized)	×
235
236
237	def print_unconsumed_diagnostics(	1✔
238	results: list[ClassificationResult],
239	*,
240	show_details: bool = True,
241	) -> None:
242	"""Print diagnostic report for unconsumed blocks across all pages.
243
244	Args:
245	results: List of classification results
246	show_details: If True, show individual block details
247	"""
248	# Aggregate statistics
249	total_unconsumed = 0	×
250	category_counts: dict[UnconsumedCategory, int] = defaultdict(int)	×
251	pages_with_unconsumed: list[tuple[int, dict]] = []	×
252
253	for result in results:	×
254	if result.skipped_reason:	×
255	continue	×
256
257	analysis = analyze_unconsumed_blocks(result)	×
258	if not analysis:	×
259	continue	×
260
261	page_num = result.page_data.page_number	×
262	page_total = sum(len(blocks) for blocks in analysis.values())	×
263	total_unconsumed += page_total	×
264
265	for category, blocks in analysis.items():	×
266	category_counts[category] += len(blocks)	×
267
268	pages_with_unconsumed.append((page_num, analysis))	×
269
270	if total_unconsumed == 0:	×
271	print("\n✓ All blocks are consumed!")	×
272	return	×
273
274	print(f"\n{'=' * 80}")	×
275	print("UNCONSUMED BLOCK DIAGNOSTICS")	×
276	print(f"{'=' * 80}")	×
277	print(f"\nTotal unconsumed blocks: {total_unconsumed}")	×
278	print(f"Pages with unconsumed blocks: {len(pages_with_unconsumed)}")	×
279
280	# Print category summary
281	print("\nBy Category:")	×
282	print("-" * 60)	×
283	sorted_categories = sorted(	×
284	category_counts.items(), key=lambda x: x[1], reverse=True
285	)
286	for category, count in sorted_categories:	×
287	pct = count / total_unconsumed * 100	×
288	print(f" {category.name:25} {count:5d} ({pct:5.1f}%)")	×
289
290	# Print recommendations
291	print("\nRecommendations:")	×
292	print("-" * 60)	×
293	recommendations: dict[str, int] = defaultdict(int)	×
294
295	for _page_num, analysis in pages_with_unconsumed:	×
296	for _category, blocks in analysis.items():	×
297	for info in blocks:	×
298	recommendations[info.recommendation] += 1	×
299
300	for rec, count in sorted(recommendations.items(), key=lambda x: -x[1]):	×
301	print(f" • {rec} ({count} blocks)")	×
302
303	# Print per-page details if requested
304	if show_details:	×
305	print(f"\n{'=' * 80}")	×
306	print("PER-PAGE DETAILS")	×
307	print(f"{'=' * 80}")	×
308
309	for page_num, analysis in pages_with_unconsumed:	×
310	page_total = sum(len(blocks) for blocks in analysis.values())	×
311	print(f"\nPage {page_num}: {page_total} unconsumed blocks")	×
312
313	for category, blocks in sorted(analysis.items(), key=lambda x: x[0].name):	×
314	if not blocks:	×
315	continue	×
316	print(f" {category.name}:")	×
317	for info in blocks[:5]: # Limit to 5 per category per page	×
318	block = info.block	×
319	bbox_str = (	×
320	f"({block.bbox.x0:.1f},{block.bbox.y0:.1f},"
321	f"{block.bbox.x1:.1f},{block.bbox.y1:.1f})"
322	)
323	print(f" #{block.id} {type(block).__name__} {bbox_str}")	×
324	print(f" Reason: {info.reason}")	×
325
326	if len(blocks) > 5:	×
327	print(f" ... and {len(blocks) - 5} more")	×

bramp / build-along / 20400711546

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous