19155446196

Committed 06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%

Build # 19155446196

Build Type

push

github

Committed by

bramp

Commit Message

refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

Run Details

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.5

/src/build_a_long/pdf_extract/classifier/step_number_classifier.py

"""
Step number classifier.
"""

from dataclasses import dataclass

from build_a_long.pdf_extract.classifier.classification_result import (
    Candidate,
    ClassificationHints,
    ClassificationResult,
    ClassifierConfig,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.classifier.text_extractors import (
    extract_step_number_value,
)
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.lego_page_elements import StepNumber
from build_a_long.pdf_extract.extractor.page_blocks import Text


@dataclass
class _StepNumberScore:
    """Per-candidate score breakdown used by the step number classifier."""

    text_score: float
    """How well the text matches a step number pattern (0.0-1.0)."""

    size_score: float
    """How tall the block is relative to the page number height (0.0-1.0)."""

    def combined_score(self, config: ClassifierConfig) -> float:
        """Return the weighted mean of the text and size scores.

        The weights are read from ``config.step_number_text_weight`` and
        ``config.step_number_size_weight``. Dividing by their sum keeps the
        result in [0, 1] when both components are in [0, 1].

        Returns:
            The normalized weighted score, or 0.0 when the weights do not sum
            to a positive number.
        """
        text_weight = config.step_number_text_weight
        size_weight = config.step_number_size_weight

        weight_sum = text_weight + size_weight
        if not weight_sum > 0:
            # Nothing to normalize by; avoid dividing by zero.
            return 0.0

        weighted_total = text_weight * self.text_score + size_weight * self.size_score
        return weighted_total / weight_sum


class StepNumberClassifier(LabelClassifier):
    """Classifier for step numbers.

    ``evaluate`` builds a scored candidate for every text block that parses
    as a step number, sits above the bottom band of the page and (when a page
    number is known) is noticeably taller than the page number. ``classify``
    then promotes the candidates that were constructed successfully and meet
    the configured confidence threshold.
    """

    outputs = {"step_number"}
    requires = {"page_number"}

    def _score_step_number_size(
        self, element: Text, page_num_height: float | None
    ) -> float:
        """Score a text block by its height relative to the page number height.

        Args:
            element: The text block being scored.
            page_num_height: Height of the page number block, or None when no
                page number is available for comparison.

        Returns:
            0.0 when there is no usable page number height or the block is at
            most 1.1x that height; otherwise a value that grows linearly with
            the height excess and is capped at 1.0 once the block is 1.5x the
            page number height or taller.
        """
        if not page_num_height or page_num_height <= 0.0:
            return 0.0

        h = element.bbox.height
        # Require a 10% margin so text of roughly page-number size scores zero.
        if h <= page_num_height * 1.1:
            return 0.0

        # Fraction by which the block is taller than the page number; an
        # excess of 0.5 (i.e. 1.5x the height) or more maps to a full score.
        ratio_over = (h / page_num_height) - 1.0
        return max(0.0, min(1.0, ratio_over / 0.5))

    def evaluate(
        self,
        page_data: PageData,
        result: ClassificationResult,
    ) -> None:
        """Evaluate blocks and create candidates for step numbers.

        Each text block is scored, a StepNumber is constructed when its text
        parses, and a candidate carrying the score and any failure reason is
        added to ``result``. Winners are not chosen here; see ``classify``.

        Args:
            page_data: The page whose blocks are evaluated.
            result: Classification state; read for the existing page_number
                label and extended with "step_number" candidates.
        """
        if not page_data.blocks:
            return

        # Height of the first block already labeled as the page number, used
        # as the reference size. None when no page number was labeled.
        labeled_blocks = result.get_labeled_blocks()
        page_num_height: float | None = next(
            (
                block.bbox.height
                for block in page_data.blocks
                if labeled_blocks.get(block) == "page_number"
            ),
            None,
        )

        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Blocks centered in the bottom 10% of the page are skipped, since
        # that is where page numbers typically appear. The cut-off depends
        # only on the page, so compute it once rather than per block.
        bottom_threshold = page_bbox.y1 - (page_bbox.height * 0.1)

        for block in page_data.blocks:
            if not isinstance(block, Text):
                continue

            block_center_y = (block.bbox.y0 + block.bbox.y1) / 2
            if block_center_y >= bottom_threshold:
                continue

            text_score = self._score_step_number_text(block.text)
            if text_score == 0.0:
                continue

            size_score = self._score_step_number_size(block, page_num_height)

            # If we have a page number for size comparison, require the block to be
            # taller than the page number (size_score > 0). This prevents small
            # numeric text from being classified as step numbers.
            if page_num_height and size_score == 0.0:
                continue

            # Store detailed score object
            detail_score = _StepNumberScore(
                text_score=text_score,
                size_score=size_score,
            )

            # Try to construct (parse step number value)
            value = extract_step_number_value(block.text)
            constructed_elem = None
            failure_reason = None

            if value is not None:
                constructed_elem = StepNumber(
                    value=value,
                    bbox=block.bbox,
                )
            else:
                failure_reason = (
                    f"Could not parse step number from text: '{block.text}'"
                )

            # Add candidate
            result.add_candidate(
                "step_number",
                Candidate(
                    bbox=block.bbox,
                    label="step_number",
                    score=detail_score.combined_score(self.config),
                    score_details=detail_score,
                    constructed=constructed_elem,
                    source_block=block,
                    failure_reason=failure_reason,
                    is_winner=False,  # Will be set by classify()
                ),
            )

    def classify(
        self,
        page_data: PageData,
        result: ClassificationResult,
        hints: ClassificationHints | None,
    ) -> None:
        """Select winning step numbers from pre-built candidates.

        A candidate wins when its source block is not a page number block, its
        score meets ``config.min_confidence_threshold`` and a StepNumber was
        constructed for it. Rejected candidates get a ``failure_reason``.

        Args:
            page_data: The page being classified.
            result: Classification state holding the "step_number" candidates.
            hints: Unused by this classifier.
        """
        candidate_list = result.get_candidates("step_number")

        # Every block labeled as a page number is excluded, not only the
        # first one. `or ()` tolerates an empty/None lookup result.
        page_number_blocks = result.get_blocks_by_label("page_number") or ()

        for candidate in candidate_list:
            source_block = candidate.source_block

            # Identity comparison, as a candidate is tied to one specific
            # block. A candidate without a source block never matches.
            if source_block is not None and any(
                source_block is page_block for page_block in page_number_blocks
            ):
                # Don't classify page number as step number
                candidate.failure_reason = "Block is already labeled as page_number"
                continue

            if candidate.score < self.config.min_confidence_threshold:
                candidate.failure_reason = (
                    f"Score {candidate.score:.2f} below threshold "
                    f"{self.config.min_confidence_threshold}"
                )
                continue

            if candidate.constructed is None:
                # failure_reason was already recorded by evaluate()
                continue

            # This is a winner!
            assert isinstance(candidate.constructed, StepNumber)
            result.mark_winner(candidate, candidate.constructed)
            # NOTE(review): presumably these drop blocks nested inside or
            # near-duplicating the winner's bbox - confirm against the
            # owning classifier's implementation.
            self.classifier._remove_child_bboxes(
                page_data, candidate.source_block, result
            )
            self.classifier._remove_similar_bboxes(
                page_data, candidate.source_block, result
            )

    def _score_step_number_text(self, text: str) -> float:
        """Score text based on whether it parses as a step number.

        Returns:
            1.0 if ``extract_step_number_value`` yields a value for ``text``,
            0.0 otherwise.
        """
        # Use the extraction function to validate format
        return 1.0 if extract_step_number_value(text) is not None else 0.0

1	"""
2	Step number classifier.
3	"""
4
5	from dataclasses import dataclass	1✔
6
7	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
8	Candidate,
9	ClassificationHints,
10	ClassificationResult,
11	ClassifierConfig,
12	)
13	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
14	LabelClassifier,
15	)
16	from build_a_long.pdf_extract.classifier.text_extractors import (	1✔
17	extract_step_number_value,
18	)
19	from build_a_long.pdf_extract.extractor import PageData	1✔
20	from build_a_long.pdf_extract.extractor.lego_page_elements import StepNumber	1✔
21	from build_a_long.pdf_extract.extractor.page_blocks import Text	1✔
22
23
24	@dataclass	1✔
25	class _StepNumberScore:	1✔
26	"""Internal score representation for step number classification."""
27
28	text_score: float
29	"""Score based on how well the text matches step number patterns (0.0-1.0)."""	1✔
30
31	size_score: float
32	"""Score based on element height relative to page number height (0.0-1.0)."""	1✔
33
34	def combined_score(self, config: ClassifierConfig) -> float:	1✔
35	"""Calculate final weighted score from components."""
36	# Sum the weighted components
37	score = (	1✔
38	config.step_number_text_weight * self.text_score
39	+ config.step_number_size_weight * self.size_score
40	)
41	# Normalize by the sum of weights to keep score in [0, 1]
42	total_weight = config.step_number_text_weight + config.step_number_size_weight	1✔
43	return score / total_weight if total_weight > 0 else 0.0	1✔
44
45
46	class StepNumberClassifier(LabelClassifier):	1✔
47	"""Classifier for step numbers."""
48
49	outputs = {"step_number"}	1✔
50	requires = {"page_number"}	1✔
51
52	def _score_step_number_size(	1✔
53	self, element: Text, page_num_height: float | None
54	) -> float:
55	"""Score based on element height relative to page number height.
56
57	Returns 0.0 if element is not significantly taller than page number,
58	scaling up to 1.0 as element gets taller.
59	"""
60	if not page_num_height or page_num_height <= 0.0:	1✔
61	return 0.0	1✔
62
63	h = element.bbox.height	1✔
64	if h <= page_num_height * 1.1:	1✔
65	return 0.0	1✔
66
67	ratio_over = (h / page_num_height) - 1.0	1✔
68	return max(0.0, min(1.0, ratio_over / 0.5))	1✔
69
70	def evaluate(	1✔
71	self,
72	page_data: PageData,
73	result: ClassificationResult,
74	) -> None:
75	"""Evaluate elements and create candidates for step numbers.
76
77	This method scores each text element, attempts to construct StepNumber objects,
78	and stores all candidates with their scores and any failure reasons.
79	"""
80	if not page_data.blocks:	1✔
81	return	1✔
82
83	page_num_height: float | None = None	1✔
84	# Find the page_number block to use for size comparison
85	labeled_blocks = result.get_labeled_blocks()	1✔
86	for block in page_data.blocks:	1✔
87	if labeled_blocks.get(block) == "page_number":	1✔
88	page_num_height = block.bbox.height	1✔
89	break	1✔
90
91	# Get page bbox and height for bottom band check
92	page_bbox = page_data.bbox	1✔
93	assert page_bbox is not None	1✔
94	page_height = page_bbox.height	1✔
95
96	for block in page_data.blocks:	1✔
97	if not isinstance(block, Text):	1✔
98	continue	1✔
99
100	# Skip blocks in the bottom 10% of the page where page numbers typically appear
101	block_center_y = (block.bbox.y0 + block.bbox.y1) / 2	1✔
102	bottom_threshold = page_bbox.y1 - (page_height * 0.1)	1✔
103	if block_center_y >= bottom_threshold:	1✔
104	continue	1✔
105
106	text_score = self._score_step_number_text(block.text)	1✔
107	if text_score == 0.0:	1✔
108	continue	1✔
109
110	size_score = self._score_step_number_size(block, page_num_height)	1✔
111
112	# If we have a page number for size comparison, require the block to be
113	# taller than the page number (size_score > 0). This prevents small
114	# numeric text from being classified as step numbers.
115	if page_num_height and size_score == 0.0:	1✔
116	continue	1✔
117
118	# Store detailed score object
119	detail_score = _StepNumberScore(	1✔
120	text_score=text_score,
121	size_score=size_score,
122	)
123
124	# Try to construct (parse step number value)
125	value = extract_step_number_value(block.text)	1✔
126	constructed_elem = None	1✔
127	failure_reason = None	1✔
128
129	if value is not None:	1✔
130	constructed_elem = StepNumber(	1✔
131	value=value,
132	bbox=block.bbox,
133	)
134	else:
135	failure_reason = (	×
136	f"Could not parse step number from text: '{block.text}'"
137	)
138
139	# Add candidate
140	result.add_candidate(	1✔
141	"step_number",
142	Candidate(
143	bbox=block.bbox,
144	label="step_number",
145	score=detail_score.combined_score(self.config),
146	score_details=detail_score,
147	constructed=constructed_elem,
148	source_block=block,
149	failure_reason=failure_reason,
150	is_winner=False, # Will be set by classify()
151	),
152	)
153
154	def classify(	1✔
155	self,
156	page_data: PageData,
157	result: ClassificationResult,
158	hints: ClassificationHints | None,
159	) -> None:
160	"""Select winning step numbers from pre-built candidates."""
161	# Get pre-built candidates
162	candidate_list = result.get_candidates("step_number")	1✔
163
164	# Find the page number block to avoid classifying it as a step number
165	page_number_blocks = result.get_blocks_by_label("page_number")	1✔
166	page_number_block = page_number_blocks[0] if page_number_blocks else None	1✔
167
168	# Mark winners (all successfully constructed candidates that aren't the page number
169	# and meet the confidence threshold)
170	for candidate in candidate_list:	1✔
171	if candidate.source_block is page_number_block:	1✔
172	# Don't classify page number as step number
NEW 173	candidate.failure_reason = "Block is already labeled as page_number"	×
174	continue	×
175
176	if candidate.score < self.config.min_confidence_threshold:	1✔
177	candidate.failure_reason = (	×
178	f"Score {candidate.score:.2f} below threshold "
179	f"{self.config.min_confidence_threshold}"
180	)
181	continue	×
182
183	if candidate.constructed is None:	1✔
184	# Already has failure_reason from calculate_scores
185	continue	×
186
187	# This is a winner!
188	assert isinstance(candidate.constructed, StepNumber)	1✔
189	result.mark_winner(candidate, candidate.constructed)	1✔
190	self.classifier._remove_child_bboxes(	1✔
191	page_data, candidate.source_block, result
192	)
193	self.classifier._remove_similar_bboxes(	1✔
194	page_data, candidate.source_block, result
195	)
196
197	def _score_step_number_text(self, text: str) -> float:	1✔
198	"""Score text based on how well it matches step number patterns.
199
200	Returns:
201	1.0 if text matches step number pattern, 0.0 otherwise
202	"""
203	# Use the extraction function to validate format
204	if extract_step_number_value(text) is not None:	1✔
205	return 1.0	1✔
206	return 0.0	1✔

bramp / build-along / 19155446196

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous