19251794703

Committed 11 Nov 2025 01:25AM UTC coverage: 90.748% (+3.9%) from 86.822%

Build # 19251794703

Build Type

push

github

Committed by

bramp

Commit Message

Update golden files to reflect improved parts list classification

The parts_list_max_area_ratio filter now correctly rejects full-page
drawings (bbox: 0,0 to 552.76,496.06) that were previously incorrectly
classified as parts lists.

Updated golden files:
- 6509377_page_015_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox
- 6509377_page_180_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox

These changes reflect the correct behavior where drawings occupying
>75% of the page area are rejected as likely background elements.

Run Details

4708 of 5188 relevant lines covered (90.75%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.05

/src/build_a_long/pdf_extract/classifier/step_classifier.py

"""
Step classifier.

Purpose
-------
Identify complete Step structures by combining step_number, parts_list, and diagram
elements. A Step represents a single building instruction comprising:
- A StepNumber label
- An optional PartsList (the parts needed for this step)
- A Diagram (the main instruction graphic showing what to build)

We look for step_numbers and attempt to pair them with nearby parts_lists and
identify the appropriate diagram region for each step.

Debugging
---------
Set environment variables to aid investigation without code changes:

- LOG_LEVEL=DEBUG
    Enables DEBUG-level logging (if not already configured by caller).
"""

import logging
from dataclasses import dataclass

from build_a_long.pdf_extract.classifier.classification_result import (
    Candidate,
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.lego_page_elements import (
    Diagram,
    PartsList,
    Step,
    StepNumber,
)

log = logging.getLogger(__name__)


@dataclass
class _StepScore:
    """Score details recorded for a single Step candidate.

    Holds the individual pairing components so the overall pairing quality
    and the candidate ordering can be derived from them on demand.
    """

    step_number: StepNumber
    """The StepNumber this candidate was built around."""

    has_parts_list: bool
    """True when the candidate was paired with a PartsList."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above (0.0-1.0).
    1.0 means closest, 0.0 means very far or no parts list."""

    step_alignment_score: float
    """Left-edge alignment with the PartsList above (0.0-1.0).
    1.0 means perfectly aligned, 0.0 means very misaligned or no parts list."""

    diagram_area: float
    """Area of the diagram region computed for this candidate."""

    def pairing_score(self) -> float:
        """Return the mean of proximity and alignment, or 0.0 without a parts list."""
        if self.has_parts_list:
            combined = self.step_proximity_score + self.step_alignment_score
            return combined / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Return the ordering key used when ranking candidates.

        Ascending order on this key yields:
        1. Higher pairing scores first (the score is negated)
        2. Lower step number values first when scores tie
        """
        negated_quality = -self.pairing_score()
        return (negated_quality, self.step_number.value)


@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    ``evaluate`` creates one Step candidate for every StepNumber/PartsList
    pairing plus one fallback candidate per StepNumber with no PartsList.
    ``classify`` then greedily marks winners so that each step number value
    and each PartsList is used at most once.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def evaluate(self, result: ClassificationResult) -> None:
        """Evaluate elements and create Step candidates for all possible pairings.

        Creates Step candidates for each StepNumber, scoring all possible pairings
        with PartsLists. Candidates are stored in ClassificationResult, and the
        best ones will be selected in classify().

        Args:
            result: Classification result providing the winning step_number and
                parts_list candidates; new "step" candidates are added to it.
        """
        page_data = result.page_data

        # Winning step_number candidates that actually constructed a StepNumber
        steps: list[StepNumber] = [
            candidate.constructed
            for candidate in result.get_candidates("step_number")
            if candidate.is_winner and isinstance(candidate.constructed, StepNumber)
        ]

        # Without any step numbers there is nothing to build a Step around
        if not steps:
            return

        # Winning parts_list candidates that actually constructed a PartsList
        parts_lists: list[PartsList] = [
            candidate.constructed
            for candidate in result.get_candidates("parts_list")
            if candidate.is_winner and isinstance(candidate.constructed, PartsList)
        ]

        log.debug(
            "[step] page=%s steps=%d parts_lists=%d",
            page_data.page_number,
            len(steps),
            len(parts_lists),
        )

        # Create Step candidates for all possible pairings
        for step_num in steps:
            # Create candidates for this StepNumber paired with each PartsList
            for parts_list in parts_lists:
                self._create_step_candidate(step_num, parts_list, result)

            # Also create a candidate with no PartsList (fallback)
            self._create_step_candidate(step_num, None, result)

        log.debug(
            "[step] Created %d step candidates",
            len(result.get_candidates("step")),
        )

    def _create_step_candidate(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> None:
        """Create a Step candidate for a StepNumber paired with a PartsList (or None).

        The pairing is only scored when the PartsList ends above the top of the
        StepNumber (within a small tolerance); otherwise both component scores
        stay at 0.0. The candidate is added to ``result`` without being marked
        as a winner.

        Args:
            step_num: The StepNumber for this candidate
            parts_list: The PartsList to pair with (or None for no pairing)
            result: Classification result to add the candidate to
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        # Calculate pairing scores if there's a parts_list above the step
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list is not None
            and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
        ):
            # Calculate distance (how far apart vertically)
            distance = step_num.bbox.y0 - parts_list.bbox.y1

            # Calculate proximity score. The "above" check tolerates an overlap
            # of up to ABOVE_EPS, so distance can be slightly negative; clamp the
            # result so the score stays inside its documented 0.0-1.0 range.
            max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = min(1.0, max(0.0, 1.0 - (distance / max_distance)))

            # Calculate alignment score (how well left edges align)
            max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)

        # Build Step. When no PartsList was supplied, an empty placeholder located
        # at the step number's bbox is used; classify() recognises it by its
        # empty parts.
        diagram = Diagram(bbox=diagram_bbox)
        step_parts_list = (
            parts_list
            if parts_list is not None
            else PartsList(bbox=step_num.bbox, parts=[])
        )
        constructed = Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=step_parts_list,
            diagram=diagram,
        )

        # Create score
        score = _StepScore(
            step_number=step_num,
            has_parts_list=parts_list is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Add candidate (not yet a winner)
        step_candidate = Candidate(
            bbox=constructed.bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=constructed,
            source_block=None,
            failure_reason=None,
            is_winner=False,
        )

        result.add_candidate("step", step_candidate)

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now this is a simple heuristic: the region starts at the step number's
        left edge, begins below the step number (and below the parts list, if it
        ends lower), and extends to the right and bottom edges of the page.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data
        # Simple heuristic: use the step number's bbox as a starting point
        # In the future, we should look for actual drawing elements below the step

        # Start with step number position
        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        # Extend to a reasonable area (placeholder logic)
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        # The page bbox is required to bound the region
        assert page_bbox is not None

        # Use the rest of the page width and height as a simple approximation
        return BBox(x0=x0, y0=y0, x1=page_bbox.x1, y1=page_bbox.y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def classify(self, result: ClassificationResult) -> None:
        """Greedily select the best Step candidates.

        Uses the candidates created in evaluate() to select the best pairings.
        Ensures each StepNumber value and each PartsList is used at most once.
        A Step whose parts list has no parts is treated as having no parts list,
        so it never claims or conflicts with a PartsList.

        Args:
            result: Classification result holding the "step" candidates; winners
                are recorded on it via ``mark_winner``.
        """
        # Get all Step candidates
        candidate_list = result.get_candidates("step")

        # Sort candidates by score (highest first)
        sorted_candidates = sorted(
            candidate_list,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()

        # Greedily select winners
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                continue

            assert isinstance(candidate.constructed, Step)
            step = candidate.constructed
            step_value = step.step_number.value
            # An empty parts list is the "no PartsList" placeholder
            parts_list = step.parts_list if step.parts_list.parts else None

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used (if it has parts)
            if parts_list is not None:
                # PartsLists are tracked by object identity
                parts_list_id = id(parts_list)
                if parts_list_id in used_parts_list_ids:
                    log.debug(
                        "[step] Skipping candidate for step %d - "
                        "PartsList already used",
                        step_value,
                    )
                    continue
                # Claim this parts_list
                used_parts_list_ids.add(parts_list_id)

            # Mark this candidate as winner
            result.mark_winner(candidate, step)
            used_step_values.add(step_value)

            log.debug(
                "[step] Marking step %d as winner (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list is not None else "no",
                candidate.score_details.pairing_score(),
            )

1	"""
2	Step classifier.
3
4	Purpose
5	-------
6	Identify complete Step structures by combining step_number, parts_list, and diagram
7	elements. A Step represents a single building instruction comprising:
8	- A StepNumber label
9	- An optional PartsList (the parts needed for this step)
10	- A Diagram (the main instruction graphic showing what to build)
11
12	We look for step_numbers and attempt to pair them with nearby parts_lists and
13	identify the appropriate diagram region for each step.
14
15	Debugging
16	---------
17	Set environment variables to aid investigation without code changes:
18
19	- LOG_LEVEL=DEBUG
20	Enables DEBUG-level logging (if not already configured by caller).
21	"""
22
23	import logging	1✔
24	from dataclasses import dataclass	1✔
25
26	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
27	Candidate,
28	ClassificationResult,
29	)
30	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
31	LabelClassifier,
32	)
33	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
34	from build_a_long.pdf_extract.extractor.lego_page_elements import (	1✔
35	Diagram,
36	PartsList,
37	Step,
38	StepNumber,
39	)
40
41	log = logging.getLogger(__name__)	1✔
42
43
44	@dataclass	1✔
45	class _StepScore:	1✔
46	"""Internal score representation for step classification."""
47
48	step_number: StepNumber
49	"""The step number this step is associated with."""	1✔
50
51	has_parts_list: bool
52	"""Whether this step has an associated parts list."""	1✔
53
54	step_proximity_score: float
55	"""Score based on proximity to the PartsList above (0.0-1.0).	1✔
56	1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
57
58	step_alignment_score: float
59	"""Score based on left-edge alignment with PartsList above (0.0-1.0).	1✔
60	1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
61
62	diagram_area: float
63	"""Area of the diagram region."""	1✔
64
65	def pairing_score(self) -> float:	1✔
66	"""Calculate pairing quality score (average of proximity and alignment)."""
67	if not self.has_parts_list:	1✔
68	return 0.0	1✔
69	return (self.step_proximity_score + self.step_alignment_score) / 2.0	1✔
70
71	def sort_key(self) -> tuple[float, int]:	1✔
72	"""Return a tuple for sorting candidates.
73
74	We prefer:
75	1. Higher pairing scores (better StepNumber-PartsList match)
76	2. Lower step number values (to break ties and maintain order)
77	"""
78	return (-self.pairing_score(), self.step_number.value)	1✔
79
80
81	@dataclass(frozen=True)	1✔
82	class StepClassifier(LabelClassifier):	1✔
83	"""Classifier for complete Step structures."""
84
85	outputs = frozenset({"step"})	1✔
86	requires = frozenset({"step_number", "parts_list"})	1✔
87
88	def evaluate(self, result: ClassificationResult) -> None:	1✔
89	"""Evaluate elements and create Step candidates for all possible pairings.
90
91	Creates Step candidates for each StepNumber, scoring all possible pairings
92	with PartsLists. Candidates are stored in ClassificationResult, and the
93	best ones will be selected in classify().
94	"""
95	page_data = result.page_data	1✔
96
97	# Get step_number candidates
98	step_candidates = result.get_candidates("step_number")	1✔
99	steps: list[StepNumber] = []	1✔
100
101	for candidate in step_candidates:	1✔
102	if (	1✔
103	candidate.is_winner
104	and candidate.constructed is not None
105	and isinstance(candidate.constructed, StepNumber)
106	):
107	steps.append(candidate.constructed)	1✔
108
109	if not steps:	1✔
110	return	1✔
111
112	# Get parts_list candidates (winners only)
113	parts_list_candidates = result.get_candidates("parts_list")	1✔
114	parts_lists: list[PartsList] = []	1✔
115
116	for candidate in parts_list_candidates:	1✔
117	if (	1✔
118	candidate.is_winner
119	and candidate.constructed is not None
120	and isinstance(candidate.constructed, PartsList)
121	):
122	parts_lists.append(candidate.constructed)	1✔
123
124	log.debug(	1✔
125	"[step] page=%s steps=%d parts_lists=%d",
126	page_data.page_number,
127	len(steps),
128	len(parts_lists),
129	)
130
131	# Create Step candidates for all possible pairings
132	for step_num in steps:	1✔
133	# Create candidates for this StepNumber paired with each PartsList
134	for parts_list in parts_lists:	1✔
135	self._create_step_candidate(step_num, parts_list, result)	1✔
136
137	# Also create a candidate with no PartsList (fallback)
138	self._create_step_candidate(step_num, None, result)	1✔
139
140	log.debug(	1✔
141	"[step] Created %d step candidates",
142	len(result.get_candidates("step")),
143	)
144
145	def _create_step_candidate(	1✔
146	self,
147	step_num: StepNumber,
148	parts_list: PartsList | None,
149	result: ClassificationResult,
150	) -> None:
151	"""Create a Step candidate for a StepNumber paired with a PartsList (or None).
152
153	Args:
154	step_num: The StepNumber for this candidate
155	parts_list: The PartsList to pair with (or None for no pairing)
156	result: Classification result to add the candidate to
157	"""
158	ABOVE_EPS = 2.0 # Small epsilon for "above" check	1✔
159	ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0 # Max horizontal offset	1✔
160	DISTANCE_THRESHOLD_MULTIPLIER = 1.0 # Max vertical distance	1✔
161
162	# Calculate pairing scores if there's a parts_list above the step
163	proximity_score = 0.0	1✔
164	alignment_score = 0.0	1✔
165
166	if (	1✔
167	parts_list is not None
168	and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
169	):
170	# Calculate distance (how far apart vertically)
171	distance = step_num.bbox.y0 - parts_list.bbox.y1	1✔
172
173	# Calculate proximity score
174	max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER	1✔
175	if max_distance > 0:	1✔
176	proximity_score = max(0.0, 1.0 - (distance / max_distance))	1✔
177
178	# Calculate alignment score (how well left edges align)
179	max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER	1✔
180	left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)	1✔
181	if max_alignment_diff > 0:	1✔
182	alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))	1✔
183
184	# Identify diagram region
185	diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)	1✔
186
187	# Build Step
188	diagram = Diagram(bbox=diagram_bbox)	1✔
189	constructed = Step(	1✔
190	bbox=self._compute_step_bbox(step_num, parts_list, diagram),
191	step_number=step_num,
192	parts_list=parts_list or PartsList(bbox=step_num.bbox, parts=[]),
193	diagram=diagram,
194	)
195
196	# Create score
197	score = _StepScore(	1✔
198	step_number=step_num,
199	has_parts_list=parts_list is not None,
200	step_proximity_score=proximity_score,
201	step_alignment_score=alignment_score,
202	diagram_area=diagram_bbox.area,
203	)
204
205	# Add candidate (not yet a winner)
206	step_candidate = Candidate(	1✔
207	bbox=constructed.bbox,
208	label="step",
209	score=score.pairing_score(),
210	score_details=score,
211	constructed=constructed,
212	source_block=None,
213	failure_reason=None,
214	is_winner=False,
215	)
216
217	result.add_candidate("step", step_candidate)	1✔
218
219	def _identify_diagram_region(	1✔
220	self,
221	step_num: StepNumber,
222	parts_list: PartsList | None,
223	result: ClassificationResult,
224	) -> BBox:
225	"""Identify the diagram region for a step.
226
227	The diagram is typically the large area below the step number and parts list.
228	For now, we create a simple heuristic-based region.
229
230	Args:
231	step_num: The step number
232	parts_list: The associated parts list (if any)
233	result: Classification result containing page_data
234
235	Returns:
236	BBox representing the diagram region
237	"""
238	page_data = result.page_data	1✔
239	# Simple heuristic: use the step number's bbox as a starting point
240	# In the future, we should look for actual drawing elements below the step
241
242	# Start with step number position
243	x0 = step_num.bbox.x0	1✔
244	y0 = step_num.bbox.y1 # Below the step number	1✔
245
246	# If there's a parts list, the diagram should be below it
247	if parts_list:	1✔
248	y0 = max(y0, parts_list.bbox.y1)	1✔
249
250	# Extend to a reasonable area (placeholder logic)
251	# TODO: Find actual drawing elements and use their bounds
252	page_bbox = page_data.bbox	1✔
253	assert page_bbox is not None	1✔
254
255	# Use the rest of the page width and height as a simple approximation
256	x1 = page_bbox.x1	1✔
257	y1 = page_bbox.y1	1✔
258
259	# Create a bbox for the diagram region
260	return BBox(x0=x0, y0=y0, x1=x1, y1=y1)	1✔
261
262	def _compute_step_bbox(	1✔
263	self,
264	step_num: StepNumber,
265	parts_list: PartsList | None,
266	diagram: Diagram,
267	) -> BBox:
268	"""Compute the overall bounding box for the Step.
269
270	This encompasses the step number, parts list (if any), and diagram.
271
272	Args:
273	step_num: The step number element
274	parts_list: The parts list (if any)
275	diagram: The diagram element
276
277	Returns:
278	Combined bounding box
279	"""
280	bboxes = [step_num.bbox, diagram.bbox]	1✔
281	if parts_list:	1✔
282	bboxes.append(parts_list.bbox)	1✔
283
284	return BBox.union_all(bboxes)	1✔
285
286	def classify(self, result: ClassificationResult) -> None:	1✔
287	"""Greedily select the best Step candidates.
288
289	Uses the candidates created in evaluate() to select the best pairings.
290	Ensures each StepNumber value and each PartsList is used at most once.
291	"""
292	# Get all Step candidates
293	candidate_list = result.get_candidates("step")	1✔
294
295	# Sort candidates by score (highest first)
296	sorted_candidates = sorted(	1✔
297	candidate_list,
298	key=lambda c: c.score_details.sort_key(),
299	)
300
301	# Track which StepNumber values and PartsLists have been used
302	used_step_values: set[int] = set()	1✔
303	used_parts_list_ids: set[int] = set()	1✔
304
305	# Greedily select winners
306	for candidate in sorted_candidates:	1✔
307	if candidate.constructed is None:	1✔
308	continue	×
309
310	assert isinstance(candidate.constructed, Step)	1✔
311	step = candidate.constructed	1✔
312	step_value = step.step_number.value	1✔
313	parts_list = step.parts_list if len(step.parts_list.parts) > 0 else None	1✔
314
315	# Skip if this step number value is already used
316	if step_value in used_step_values:	1✔
317	log.debug(	1✔
318	"[step] Skipping candidate for step %d - value already used",
319	step_value,
320	)
321	continue	1✔
322
323	# Skip if this parts_list is already used (if it has parts)
324	if parts_list is not None:	1✔
325	parts_list_id = id(parts_list)	1✔
326	if parts_list_id in used_parts_list_ids:	1✔
327	log.debug(	1✔
328	"[step] Skipping candidate for step %d - "
329	"PartsList already used",
330	step_value,
331	)
332	continue	1✔
333	# Claim this parts_list
334	used_parts_list_ids.add(parts_list_id)	1✔
335
336	# Mark this candidate as winner
337	result.mark_winner(candidate, step)	1✔
338	used_step_values.add(step_value)	1✔
339
340	log.debug(	1✔
341	"[step] Marking step %d as winner (parts_list=%s, pairing_score=%.2f)",
342	step_value,
343	"yes" if parts_list is not None else "no",
344	candidate.score_details.pairing_score(),
345	)

bramp / build-along / 19251794703

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous