19155446196

Committed 06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%

Build # 19155446196

Build Type

push

github

Committed by

bramp

Commit Message

refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

Run Details

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.3

/src/build_a_long/pdf_extract/classifier/part_count_classifier.py

"""
Part count classifier.

Purpose
-------
Detect part-count text like "2x", "3X", or "5×".

Debugging
---------
Enable DEBUG logs with LOG_LEVEL=DEBUG. Heavier trace can be enabled when
CLASSIFIER_DEBUG is set to "part_count" or "all".
"""

import logging
import os
from dataclasses import dataclass

from build_a_long.pdf_extract.classifier.classification_result import (
    Candidate,
    ClassificationHints,
    ClassificationResult,
    ClassifierConfig,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.text_extractors import (
    extract_part_count_value,
)
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.lego_page_elements import PartCount
from build_a_long.pdf_extract.extractor.page_blocks import Text

log = logging.getLogger(__name__)


@dataclass
class _PartCountScore:
    """Score breakdown recorded for a single part-count candidate.

    Attributes:
        text_score: How well the candidate's text matches the part-count
            patterns, in the range 0.0-1.0.
    """

    text_score: float

    def combined_score(self, config: ClassifierConfig) -> float:
        """Return the overall score for this candidate.

        Part counts are judged on their text alone, so the overall score is
        simply ``text_score``. ``config`` is accepted but not consulted here.
        """
        overall = self.text_score
        return overall


class PartCountClassifier(LabelClassifier):
    """Classifier for part counts.

    ``evaluate`` creates one "part_count" candidate for every text block on
    the page; ``classify`` then marks each candidate that was successfully
    constructed as a winner.
    """

    outputs = {"part_count"}
    requires = set()

    def __init__(self, config: ClassifierConfig, classifier):
        """Initialise the classifier and read the debug-trace switch.

        Args:
            config: Classifier configuration, forwarded to the parent class.
            classifier: Owning classifier, forwarded to the parent class.
        """
        super().__init__(config, classifier)
        # Can the following go into the parent, and use "outputs" as identifier?
        # Trace logging is opt-in via CLASSIFIER_DEBUG=part_count|all
        # (compared case-insensitively).
        self._debug_enabled = os.getenv("CLASSIFIER_DEBUG", "").lower() in (
            "part_count",
            "all",
        )

    def evaluate(
        self,
        page_data: PageData,
        result: ClassificationResult,
    ) -> None:
        """Evaluate text blocks and create candidates for part counts.

        Every ``Text`` block on the page is parsed with
        ``extract_part_count_value``. A candidate is stored for each text
        block whether or not it parses: blocks that parse get a score of 1.0
        and a constructed ``PartCount``; the rest get a score of 0.0 and a
        failure reason.

        Args:
            page_data: Page whose ``blocks`` are inspected.
            result: Classification result that receives the candidates.
        """
        if not page_data.blocks:
            return

        for block in page_data.blocks:
            if not isinstance(block, Text):
                continue

            # Parse exactly once. The text score is defined as "does the text
            # parse as a part count" (see _score_part_count_text), so it is
            # derived from this single result rather than re-parsing.
            value = extract_part_count_value(block.text)
            text_score = 1.0 if value is not None else 0.0

            # Store detailed score object
            detail_score = _PartCountScore(text_score=text_score)

            if self._debug_enabled:
                log.debug(
                    "[part_count] match text=%r score=%.2f bbox=%s",
                    block.text,
                    text_score,
                    block.bbox,
                )

            constructed_elem = None
            failure_reason = None

            if value is None:
                failure_reason = (
                    f"Text doesn't match part count pattern: '{block.text}'"
                )
            else:
                constructed_elem = PartCount(
                    count=value,
                    bbox=block.bbox,
                )

            # Add candidate
            result.add_candidate(
                "part_count",
                Candidate(
                    bbox=block.bbox,
                    label="part_count",
                    score=detail_score.combined_score(self.config),
                    score_details=detail_score,
                    constructed=constructed_elem,
                    source_block=block,
                    failure_reason=failure_reason,
                    is_winner=False,  # Will be set by classify()
                ),
            )

    def classify(
        self,
        page_data: PageData,
        result: ClassificationResult,
        hints: ClassificationHints | None,
    ) -> None:
        """Select winning part counts from pre-built candidates.

        Every "part_count" candidate with a constructed ``PartCount`` is
        marked as a winner; candidates without one are skipped.

        Args:
            page_data: Page the candidates were built from.
            result: Classification result holding the candidates.
            hints: Not used by this classifier.
        """
        # Get pre-built candidates
        candidate_list = result.get_candidates("part_count")

        # Mark winners (all successfully constructed candidates)
        for candidate in candidate_list:
            if candidate.constructed is None:
                # Already has failure_reason from evaluate()
                continue

            # This is a winner!
            assert isinstance(candidate.constructed, PartCount)
            result.mark_winner(candidate, candidate.constructed)
            # NOTE(review): presumably these drop blocks nested inside or
            # near-duplicating the winner's bbox - confirm against the
            # owning classifier's implementation.
            self.classifier._remove_child_bboxes(
                page_data, candidate.source_block, result
            )
            self.classifier._remove_similar_bboxes(
                page_data, candidate.source_block, result
            )

    @staticmethod
    def _score_part_count_text(text: str) -> float:
        """Score text based on how well it matches part count patterns.

        Returns:
            1.0 if text matches part count pattern, 0.0 otherwise
        """
        # Use the extraction function to validate format
        if extract_part_count_value(text) is not None:
            return 1.0
        return 0.0

1	"""
2	Part count classifier.
3
4	Purpose
5	-------
6	Detect part-count text like "2x", "3X", or "5×".
7
8	Debugging
9	---------
10	Enable DEBUG logs with LOG_LEVEL=DEBUG. Heavier trace can be enabled when
11	CLASSIFIER_DEBUG is set to "part_count" or "all".
12	"""
13
14	import logging	1✔
15	import os	1✔
16	from dataclasses import dataclass	1✔
17
18	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
19	Candidate,
20	ClassificationHints,
21	ClassificationResult,
22	ClassifierConfig,
23	)
24	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
25	LabelClassifier,
26	)
27	from build_a_long.pdf_extract.classifier.text_extractors import (	1✔
28	extract_part_count_value,
29	)
30	from build_a_long.pdf_extract.extractor import PageData	1✔
31	from build_a_long.pdf_extract.extractor.lego_page_elements import PartCount	1✔
32	from build_a_long.pdf_extract.extractor.page_blocks import Text	1✔
33
34	log = logging.getLogger(__name__)	1✔
35
36
37	@dataclass	1✔
38	class _PartCountScore:	1✔
39	"""Internal score representation for part count classification."""
40
41	text_score: float
42	"""Score based on how well the text matches part count patterns (0.0-1.0)."""	1✔
43
44	def combined_score(self, config: ClassifierConfig) -> float:	1✔
45	"""Calculate final weighted score from components.
46
47	For part count, we only have text_score, so just return it directly.
48	"""
49	return self.text_score	1✔
50
51
52	class PartCountClassifier(LabelClassifier):	1✔
53	"""Classifier for part counts."""
54
55	outputs = {"part_count"}	1✔
56	requires = set()	1✔
57
58	def __init__(self, config: ClassifierConfig, classifier):	1✔
59	super().__init__(config, classifier)	1✔
60	# Can the following go into the parent, and use "outputs" as identifier?
61	self._debug_enabled = os.getenv("CLASSIFIER_DEBUG", "").lower() in (	1✔
62	"part_count",
63	"all",
64	)
65
66	def evaluate(	1✔
67	self,
68	page_data: PageData,
69	result: ClassificationResult,
70	) -> None:
71	"""Evaluate elements and create candidates for part counts.
72
73	This method scores each text element, attempts to construct PartCount objects,
74	and stores all candidates with their scores and any failure reasons.
75	"""
76	if not page_data.blocks:	1✔
77	return	1✔
78
79	for block in page_data.blocks:	1✔
80	if not isinstance(block, Text):	1✔
81	continue	1✔
82
83	text_score = PartCountClassifier._score_part_count_text(block.text)	1✔
84
85	# Store detailed score object
86	detail_score = _PartCountScore(text_score=text_score)	1✔
87
88	if self._debug_enabled:	1✔
89	log.debug(	×
90	"[part_count] match text=%r score=%.2f bbox=%s",
91	block.text,
92	text_score,
93	block.bbox,
94	)
95
96	# Try to construct (parse part count value)
97	value = extract_part_count_value(block.text)	1✔
98	constructed_elem = None	1✔
99	failure_reason = None	1✔
100
101	if text_score == 0.0:	1✔
102	failure_reason = (	1✔
103	f"Text doesn't match part count pattern: '{block.text}'"
104	)
105	elif value is None:	1✔
NEW 106	failure_reason = f"Could not parse part count from text: '{block.text}'"	×
107	else:
108	constructed_elem = PartCount(	1✔
109	count=value,
110	bbox=block.bbox,
111	)
112
113	# Add candidate
114	result.add_candidate(	1✔
115	"part_count",
116	Candidate(
117	bbox=block.bbox,
118	label="part_count",
119	score=detail_score.combined_score(self.config),
120	score_details=detail_score,
121	constructed=constructed_elem,
122	source_block=block,
123	failure_reason=failure_reason,
124	is_winner=False, # Will be set by classify()
125	),
126	)
127
128	def classify(	1✔
129	self,
130	page_data: PageData,
131	result: ClassificationResult,
132	hints: ClassificationHints | None,
133	) -> None:
134	"""Select winning part counts from pre-built candidates."""
135	# Get pre-built candidates
136	candidate_list = result.get_candidates("part_count")	1✔
137
138	# Mark winners (all successfully constructed candidates)
139	for candidate in candidate_list:	1✔
140	if candidate.constructed is None:	1✔
141	# Already has failure_reason from calculate_scores
142	continue	1✔
143
144	# This is a winner!
145	assert isinstance(candidate.constructed, PartCount)	1✔
146	result.mark_winner(candidate, candidate.constructed)	1✔
147	self.classifier._remove_child_bboxes(	1✔
148	page_data, candidate.source_block, result
149	)
150	self.classifier._remove_similar_bboxes(	1✔
151	page_data, candidate.source_block, result
152	)
153
154	@staticmethod	1✔
155	def _score_part_count_text(text: str) -> float:	1✔
156	"""Score text based on how well it matches part count patterns.
157
158	Returns:
159	1.0 if text matches part count pattern, 0.0 otherwise
160	"""
161	# Use the extraction function to validate format
162	if extract_part_count_value(text) is not None:	1✔
163	return 1.0	1✔
164	return 0.0	1✔

bramp / build-along / 19155446196

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous