19556130039

Committed 21 Nov 2025 12:48AM UTC coverage: 90.819% (-0.05%) from 90.867%

Build # 19556130039

Build Type

push

github

Committed by

bramp

Commit Message

Updated the golden fixtures.

Run Details

5025 of 5533 relevant lines covered (90.82%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07

/src/build_a_long/pdf_extract/classifier/step_classifier.py

"""
Step classifier.

Purpose
-------
Identify complete Step structures by combining step_number, parts_list, and diagram
elements. A Step represents a single building instruction comprising:
- A StepNumber label
- An optional PartsList (the parts needed for this step)
- A Diagram (the main instruction graphic showing what to build)

We look for step_numbers and attempt to pair them with nearby parts_lists and
identify the appropriate diagram region for each step.

Debugging
---------
Set environment variables to aid investigation without code changes:

- LOG_LEVEL=DEBUG
    Enables DEBUG-level logging (if not already configured by caller).
"""

import logging
from dataclasses import dataclass

from build_a_long.pdf_extract.classifier.classification_result import (
    Candidate,
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.lego_page_elements import (
    Diagram,
    PartsList,
    Step,
    StepNumber,
)

log = logging.getLogger(__name__)


@dataclass
class _StepScore:
    """Score breakdown recorded for one candidate Step pairing."""

    step_number: StepNumber
    """StepNumber element that the scored candidate is built around."""

    has_parts_list: bool
    """True when the candidate was paired with a PartsList."""

    step_proximity_score: float
    """Closeness to the PartsList above the step number (0.0-1.0).
    1.0 means closest, 0.0 means very far; also 0.0 without a parts list."""

    step_alignment_score: float
    """Left-edge agreement with the PartsList above the step number (0.0-1.0).
    1.0 means perfectly aligned, 0.0 very misaligned or no parts list."""

    diagram_area: float
    """Area covered by the candidate's diagram region."""

    def pairing_score(self) -> float:
        """Return the mean of the proximity and alignment scores.

        Candidates without a parts list always score 0.0.
        """
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Build the key used to rank candidates in ascending sort order.

        The pairing score is negated so that better pairings sort first; the
        step number value follows so that ties resolve towards lower steps.
        """
        negated_quality = -self.pairing_score()
        return (negated_quality, self.step_number.value)


@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures."""

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def evaluate(self, result: ClassificationResult) -> None:
        """Build Step candidates for the page and register the selected ones.

        Every StepNumber winner is combined with every PartsList winner, plus
        one extra combination without a PartsList. The combinations are then
        passed to ``_deduplicate_candidates`` and only the survivors are added
        to ``result`` under the "step" label.
        """
        step_numbers = result.get_winners_by_score("step_number", StepNumber)
        if not step_numbers:
            # Nothing to build a Step around on this page.
            return

        available_parts_lists = result.get_winners_by_score("parts_list", PartsList)

        log.debug(
            "[step] page=%s steps=%d parts_lists=%d",
            result.page_data.page_number,
            len(step_numbers),
            len(available_parts_lists),
        )

        # Enumerate every pairing; the trailing None is the "no PartsList"
        # fallback and is deliberately generated last for each step number.
        possibilities: list[Candidate] = []
        for step_num in step_numbers:
            for partner in (*available_parts_lists, None):
                built = self._create_step_candidate(step_num, partner, result)
                if built:
                    possibilities.append(built)

        # Keep only the greedy winners and publish them.
        winners = self._deduplicate_candidates(possibilities)
        for winner in winners:
            result.add_candidate("step", winner)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(winners),
            len(possibilities),
        )

    def _create_step_candidate(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a Step candidate for a StepNumber paired with a PartsList (or None).

        Proximity and alignment are only scored when a PartsList is given and
        its bottom edge is above the top of the step number (allowing
        ``ABOVE_EPS`` of overlap); otherwise both scores stay at 0.0.

        Args:
            step_num: The StepNumber for this candidate
            parts_list: The PartsList to pair with (or None for no pairing)
            result: Classification result; only used to look up the page
                bounds when identifying the diagram region. The candidate is
                NOT added to it here.

        Returns:
            The created Candidate. The current implementation always returns a
            Candidate; ``None`` remains in the signature for callers that
            already guard against it.
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        # Calculate pairing scores if there's a parts_list above the step
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list is not None
            and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between the parts list and the step number. The
            # "above" check tolerates up to ABOVE_EPS of overlap, which would
            # make the raw gap negative and push the score above 1.0, so the
            # gap is clamped at zero to keep the score within 0.0-1.0.
            distance = max(0.0, step_num.bbox.y0 - parts_list.bbox.y1)

            # Proximity falls off linearly, reaching 0.0 once the gap equals
            # the step number's height (times the multiplier).
            max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = max(0.0, 1.0 - (distance / max_distance))

            # Alignment falls off linearly with the left-edge offset, reaching
            # 0.0 once the offset equals the step number's width.
            max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)

        # Build Step. When no PartsList is paired, an empty placeholder
        # PartsList located at the step number's bbox is stored instead.
        diagram = Diagram(bbox=diagram_bbox)
        constructed = Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=parts_list or PartsList(bbox=step_num.bbox, parts=[]),
            diagram=diagram,
        )

        # Create score
        score = _StepScore(
            step_number=step_num,
            has_parts_list=parts_list is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Create candidate; its score is the pairing score computed above.
        step_candidate = Candidate(
            bbox=constructed.bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=constructed,
            source_block=None,
            failure_reason=None,
        )

        return step_candidate

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> BBox:
        """Estimate where the diagram of a step lies on the page.

        This is a placeholder heuristic: the region starts at the step
        number's left edge, begins below the step number (and below the parts
        list, when one is given), and runs to the right and bottom edges of
        the page.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data

        # The top edge is the lower of the two bottoms: the step number's and,
        # when present, the parts list's.
        top = step_num.bbox.y1
        if parts_list:
            top = max(top, parts_list.bbox.y1)

        # TODO: Find actual drawing elements and use their bounds
        # Until then the region simply extends to the page's far corner.
        page_bbox = page_data.bbox
        assert page_bbox is not None

        return BBox(
            x0=step_num.bbox.x0,
            y0=top,
            x1=page_bbox.x1,
            y1=page_bbox.y1,
        )

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Return the union of the bounding boxes that make up a Step.

        The step number and the diagram always contribute; the parts list
        contributes only when one is supplied.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        optional_boxes = [parts_list.bbox] if parts_list else []
        return BBox.union_all([step_num.bbox, diagram.bbox, *optional_boxes])

    def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:
        """Greedily select the best Step candidates.

        Candidates are visited in ``sort_key`` order (best pairing score first,
        then lowest step number value). A candidate is selected unless its
        step number value or its PartsList has already been taken by an
        earlier selection, so each StepNumber value and each PartsList is
        used at most once.

        Args:
            candidates: All possible Step candidates; each one's
                ``score_details`` must be a ``_StepScore``.

        Returns:
            Deduplicated list of Step candidates, in selection order
        """
        # Sort candidates by score (highest first).
        # NOTE(review): sorted() is stable and evaluate() appends PartsList
        # pairings before the no-PartsList fallback, so a pairing that scores
        # 0.0 still wins the tie against the fallback - confirm this is
        # intended.
        sorted_candidates = sorted(
            candidates,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used.
        # PartsLists are tracked by object identity (id()).
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()
        selected: list[Candidate] = []

        # Greedily select winners
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                continue

            assert isinstance(candidate.constructed, Step)
            step = candidate.constructed
            step_value = step.step_number.value

            # Rely on the explicit flag recorded at creation time rather than
            # on whether the PartsList happens to contain parts: a genuine
            # PartsList with zero parts must still be claimed exactly once,
            # and only the placeholder created for "no pairing" is exempt.
            parts_list = (
                step.parts_list if candidate.score_details.has_parts_list else None
            )

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used
            if parts_list is not None:
                parts_list_id = id(parts_list)
                if parts_list_id in used_parts_list_ids:
                    log.debug(
                        "[step] Skipping candidate for step %d - "
                        "PartsList already used",
                        step_value,
                    )
                    continue
                # Claim this parts_list
                used_parts_list_ids.add(parts_list_id)

            # Select this candidate
            selected.append(candidate)
            used_step_values.add(step_value)

            log.debug(
                "[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list is not None else "no",
                candidate.score_details.pairing_score(),
            )

        return selected

    def classify(self, result: ClassificationResult) -> None:
        """Intentionally do nothing; selection already happened in evaluate().

        Part of a refactoring that removes the is_winner flag and the
        mark_winner() method: evaluate() now creates only the final,
        deduplicated candidates, so there is nothing left to decide here.
        """
        return None

1	"""
2	Step classifier.
3
4	Purpose
5	-------
6	Identify complete Step structures by combining step_number, parts_list, and diagram
7	elements. A Step represents a single building instruction comprising:
8	- A StepNumber label
9	- An optional PartsList (the parts needed for this step)
10	- A Diagram (the main instruction graphic showing what to build)
11
12	We look for step_numbers and attempt to pair them with nearby parts_lists and
13	identify the appropriate diagram region for each step.
14
15	Debugging
16	---------
17	Set environment variables to aid investigation without code changes:
18
19	- LOG_LEVEL=DEBUG
20	Enables DEBUG-level logging (if not already configured by caller).
21	"""
22
23	import logging	1✔
24	from dataclasses import dataclass	1✔
25
26	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
27	Candidate,
28	ClassificationResult,
29	)
30	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
31	LabelClassifier,
32	)
33	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
34	from build_a_long.pdf_extract.extractor.lego_page_elements import (	1✔
35	Diagram,
36	PartsList,
37	Step,
38	StepNumber,
39	)
40
41	log = logging.getLogger(__name__)	1✔
42
43
44	@dataclass	1✔
45	class _StepScore:	1✔
46	"""Internal score representation for step classification."""
47
48	step_number: StepNumber
49	"""The step number this step is associated with."""	1✔
50
51	has_parts_list: bool
52	"""Whether this step has an associated parts list."""	1✔
53
54	step_proximity_score: float
55	"""Score based on proximity to the PartsList above (0.0-1.0).	1✔
56	1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
57
58	step_alignment_score: float
59	"""Score based on left-edge alignment with PartsList above (0.0-1.0).	1✔
60	1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
61
62	diagram_area: float
63	"""Area of the diagram region."""	1✔
64
65	def pairing_score(self) -> float:	1✔
66	"""Calculate pairing quality score (average of proximity and alignment)."""
67	if not self.has_parts_list:	1✔
68	return 0.0	1✔
69	return (self.step_proximity_score + self.step_alignment_score) / 2.0	1✔
70
71	def sort_key(self) -> tuple[float, int]:	1✔
72	"""Return a tuple for sorting candidates.
73
74	We prefer:
75	1. Higher pairing scores (better StepNumber-PartsList match)
76	2. Lower step number values (to break ties and maintain order)
77	"""
78	return (-self.pairing_score(), self.step_number.value)	1✔
79
80
81	@dataclass(frozen=True)	1✔
82	class StepClassifier(LabelClassifier):	1✔
83	"""Classifier for complete Step structures."""
84
85	outputs = frozenset({"step"})	1✔
86	requires = frozenset({"step_number", "parts_list"})	1✔
87
88	def evaluate(self, result: ClassificationResult) -> None:	1✔
89	"""Evaluate elements and create deduplicated Step candidates.
90
91	Creates Step candidates for each StepNumber, scoring all possible pairings
92	with PartsLists, then greedily selects the best pairings to ensure each
93	StepNumber value and PartsList is used at most once.
94	"""
95	page_data = result.page_data	1✔
96
97	# Get step numbers and parts lists using score-based selection
98	steps = result.get_winners_by_score("step_number", StepNumber)	1✔
99
100	if not steps:	1✔
101	return	1✔
102
103	# Get parts_list candidates by score
104	parts_lists = result.get_winners_by_score("parts_list", PartsList)	1✔
105
106	log.debug(	1✔
107	"[step] page=%s steps=%d parts_lists=%d",
108	page_data.page_number,
109	len(steps),
110	len(parts_lists),
111	)
112
113	# Create all possible Step candidates for pairings
114	all_candidates: list[Candidate] = []	1✔
115	for step_num in steps:	1✔
116	# Create candidates for this StepNumber paired with each PartsList
117	for parts_list in parts_lists:	1✔
118	candidate = self._create_step_candidate(step_num, parts_list, result)	1✔
119	if candidate:	1✔
120	all_candidates.append(candidate)	1✔
121
122	# Also create a candidate with no PartsList (fallback)
123	candidate = self._create_step_candidate(step_num, None, result)	1✔
124	if candidate:	1✔
125	all_candidates.append(candidate)	1✔
126
127	# Greedily select the best candidates (deduplication)
128	deduplicated_candidates = self._deduplicate_candidates(all_candidates)	1✔
129
130	# Add the deduplicated candidates to the result
131	for candidate in deduplicated_candidates:	1✔
132	result.add_candidate("step", candidate)	1✔
133
134	log.debug(	1✔
135	"[step] Created %d deduplicated step candidates (from %d possibilities)",
136	len(deduplicated_candidates),
137	len(all_candidates),
138	)
139
140	def _create_step_candidate(	1✔
141	self,
142	step_num: StepNumber,
143	parts_list: PartsList \| None,
144	result: ClassificationResult,
145	) -> Candidate \| None:
146	"""Create a Step candidate for a StepNumber paired with a PartsList (or None).
147
148	Args:
149	step_num: The StepNumber for this candidate
150	parts_list: The PartsList to pair with (or None for no pairing)
151	result: Classification result to add the candidate to
152
153	Returns:
154	The created Candidate, or None if creation failed
155	"""
156	ABOVE_EPS = 2.0 # Small epsilon for "above" check	1✔
157	ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0 # Max horizontal offset	1✔
158	DISTANCE_THRESHOLD_MULTIPLIER = 1.0 # Max vertical distance	1✔
159
160	# Calculate pairing scores if there's a parts_list above the step
161	proximity_score = 0.0	1✔
162	alignment_score = 0.0	1✔
163
164	if (	1✔
165	parts_list is not None
166	and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
167	):
168	# Calculate distance (how far apart vertically)
169	distance = step_num.bbox.y0 - parts_list.bbox.y1	1✔
170
171	# Calculate proximity score
172	max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER	1✔
173	if max_distance > 0:	1✔
174	proximity_score = max(0.0, 1.0 - (distance / max_distance))	1✔
175
176	# Calculate alignment score (how well left edges align)
177	max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER	1✔
178	left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)	1✔
179	if max_alignment_diff > 0:	1✔
180	alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))	1✔
181
182	# Identify diagram region
183	diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)	1✔
184
185	# Build Step
186	diagram = Diagram(bbox=diagram_bbox)	1✔
187	constructed = Step(	1✔
188	bbox=self._compute_step_bbox(step_num, parts_list, diagram),
189	step_number=step_num,
190	parts_list=parts_list or PartsList(bbox=step_num.bbox, parts=[]),
191	diagram=diagram,
192	)
193
194	# Create score
195	score = _StepScore(	1✔
196	step_number=step_num,
197	has_parts_list=parts_list is not None,
198	step_proximity_score=proximity_score,
199	step_alignment_score=alignment_score,
200	diagram_area=diagram_bbox.area,
201	)
202
203	# Create candidate
204	step_candidate = Candidate(	1✔
205	bbox=constructed.bbox,
206	label="step",
207	score=score.pairing_score(),
208	score_details=score,
209	constructed=constructed,
210	source_block=None,
211	failure_reason=None,
212	)
213
214	return step_candidate	1✔
215
216	def _identify_diagram_region(	1✔
217	self,
218	step_num: StepNumber,
219	parts_list: PartsList \| None,
220	result: ClassificationResult,
221	) -> BBox:
222	"""Identify the diagram region for a step.
223
224	The diagram is typically the large area below the step number and parts list.
225	For now, we create a simple heuristic-based region.
226
227	Args:
228	step_num: The step number
229	parts_list: The associated parts list (if any)
230	result: Classification result containing page_data
231
232	Returns:
233	BBox representing the diagram region
234	"""
235	page_data = result.page_data	1✔
236	# Simple heuristic: use the step number's bbox as a starting point
237	# In the future, we should look for actual drawing elements below the step
238
239	# Start with step number position
240	x0 = step_num.bbox.x0	1✔
241	y0 = step_num.bbox.y1 # Below the step number	1✔
242
243	# If there's a parts list, the diagram should be below it
244	if parts_list:	1✔
245	y0 = max(y0, parts_list.bbox.y1)	1✔
246
247	# Extend to a reasonable area (placeholder logic)
248	# TODO: Find actual drawing elements and use their bounds
249	page_bbox = page_data.bbox	1✔
250	assert page_bbox is not None	1✔
251
252	# Use the rest of the page width and height as a simple approximation
253	x1 = page_bbox.x1	1✔
254	y1 = page_bbox.y1	1✔
255
256	# Create a bbox for the diagram region
257	return BBox(x0=x0, y0=y0, x1=x1, y1=y1)	1✔
258
259	def _compute_step_bbox(	1✔
260	self,
261	step_num: StepNumber,
262	parts_list: PartsList \| None,
263	diagram: Diagram,
264	) -> BBox:
265	"""Compute the overall bounding box for the Step.
266
267	This encompasses the step number, parts list (if any), and diagram.
268
269	Args:
270	step_num: The step number element
271	parts_list: The parts list (if any)
272	diagram: The diagram element
273
274	Returns:
275	Combined bounding box
276	"""
277	bboxes = [step_num.bbox, diagram.bbox]	1✔
278	if parts_list:	1✔
279	bboxes.append(parts_list.bbox)	1✔
280
281	return BBox.union_all(bboxes)	1✔
282
283	def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:	1✔
284	"""Greedily select the best Step candidates.
285
286	Ensures each StepNumber value and each PartsList is used at most once.
287
288	Args:
289	candidates: All possible Step candidates
290
291	Returns:
292	Deduplicated list of Step candidates
293	"""
294	# Sort candidates by score (highest first)
295	sorted_candidates = sorted(	1✔
296	candidates,
297	key=lambda c: c.score_details.sort_key(),
298	)
299
300	# Track which StepNumber values and PartsLists have been used
301	used_step_values: set[int] = set()	1✔
302	used_parts_list_ids: set[int] = set()	1✔
303	selected: list[Candidate] = []	1✔
304
305	# Greedily select winners
306	for candidate in sorted_candidates:	1✔
307	if candidate.constructed is None:	1✔
308	continue	×
309
310	assert isinstance(candidate.constructed, Step)	1✔
311	step = candidate.constructed	1✔
312	step_value = step.step_number.value	1✔
313	parts_list = step.parts_list if len(step.parts_list.parts) > 0 else None	1✔
314
315	# Skip if this step number value is already used
316	if step_value in used_step_values:	1✔
317	log.debug(	1✔
318	"[step] Skipping candidate for step %d - value already used",
319	step_value,
320	)
321	continue	1✔
322
323	# Skip if this parts_list is already used (if it has parts)
324	if parts_list is not None:	1✔
325	parts_list_id = id(parts_list)	1✔
326	if parts_list_id in used_parts_list_ids:	1✔
327	log.debug(	1✔
328	"[step] Skipping candidate for step %d - "
329	"PartsList already used",
330	step_value,
331	)
332	continue	1✔
333	# Claim this parts_list
334	used_parts_list_ids.add(parts_list_id)	1✔
335
336	# Select this candidate
337	selected.append(candidate)	1✔
338	used_step_values.add(step_value)	1✔
339
340	log.debug(	1✔
341	"[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
342	step_value,
343	"yes" if parts_list is not None else "no",
344	candidate.score_details.pairing_score(),
345	)
346
347	return selected	1✔
348
349	def classify(self, result: ClassificationResult) -> None:	1✔
350	"""No-op - deduplication is done in evaluate().
351
352	This is part of a refactoring to eliminate the is_winner flag and
353	mark_winner() method. Selection logic now happens in evaluate() where
354	only the final deduplicated candidates are created.
355	"""
356	pass	1✔

bramp / build-along / 19556130039

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous