19257583787

Committed 11 Nov 2025 06:52AM UTC coverage: 91.217% (+0.5%) from 90.748%

Build # 19257583787

Build Type

push

github

Committed by

bramp

Commit Message

feat(pdf_extract): Update lego_page_layout tool

- Add support for ProgressBar and PartNumber elements.
- Remove NewBag and BagNumber from the example.
- Adjust ProgressBar to be left-aligned with steps and have a margin.
- Regenerate the layout diagram.

Run Details

4923 of 5397 relevant lines covered (91.22%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.97

/src/build_a_long/pdf_extract/classifier/step_classifier.py

"""
Step classifier.

Purpose
-------
Identify complete Step structures by combining step_number, parts_list, and diagram
elements. A Step represents a single building instruction comprising:
- A StepNumber label
- An optional PartsList (the parts needed for this step)
- A Diagram (the main instruction graphic showing what to build)

We look for step_numbers and attempt to pair them with nearby parts_lists and
identify the appropriate diagram region for each step.

Debugging
---------
Set environment variables to aid investigation without code changes:

- LOG_LEVEL=DEBUG
    Enables DEBUG-level logging (if not already configured by caller).
"""

import logging
from dataclasses import dataclass

from build_a_long.pdf_extract.classifier.classification_result import (
    Candidate,
    ClassificationResult,
)
from build_a_long.pdf_extract.classifier.label_classifier import (
    LabelClassifier,
)
from build_a_long.pdf_extract.extractor.bbox import BBox
from build_a_long.pdf_extract.extractor.lego_page_elements import (
    Diagram,
    PartsList,
    Step,
    StepNumber,
)

log = logging.getLogger(__name__)


@dataclass
class _StepScore:
    """Scoring record attached to a single Step candidate.

    Holds the two pairing sub-scores (vertical proximity and left-edge
    alignment) together with the step number and the diagram area, and
    derives the combined pairing score and the ordering key from them.
    """

    step_number: StepNumber
    """The step number this step is associated with."""

    has_parts_list: bool
    """Whether this step has an associated parts list."""

    step_proximity_score: float
    """Score based on proximity to the PartsList above (0.0-1.0).
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""

    step_alignment_score: float
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""

    diagram_area: float
    """Area of the diagram region."""

    def pairing_score(self) -> float:
        """Return the mean of the proximity and alignment scores.

        A candidate without a parts list always scores 0.0, whatever the two
        sub-scores hold.
        """
        if self.has_parts_list:
            combined = self.step_proximity_score + self.step_alignment_score
            return combined / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Build the key used to order candidates in an ascending sort.

        The pairing score is negated so that better pairings sort first; the
        step number value is the secondary key, so ties resolve towards the
        lower step number.
        """
        negated_quality = -self.pairing_score()
        return negated_quality, self.step_number.value


@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    ``evaluate()`` registers one "step" candidate for every pairing of a winning
    StepNumber with a winning PartsList, plus one fallback candidate per
    StepNumber that has no PartsList. ``classify()`` then greedily marks winners
    so that each step number value and each PartsList is used at most once.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def evaluate(self, result: ClassificationResult) -> None:
        """Evaluate elements and create Step candidates for all possible pairings.

        Creates Step candidates for each StepNumber, scoring all possible pairings
        with PartsLists. Candidates are stored in ClassificationResult, and the
        best ones will be selected in classify().

        Args:
            result: Classification result that supplies the step_number and
                parts_list winners and receives the new "step" candidates.
        """
        page_data = result.page_data

        # Get winners with type safety
        steps = result.get_winners("step_number", StepNumber)

        # Without any step numbers there is nothing to build a Step around.
        if not steps:
            return

        # Get parts_list winners
        parts_lists = result.get_winners("parts_list", PartsList)

        log.debug(
            "[step] page=%s steps=%d parts_lists=%d",
            page_data.page_number,
            len(steps),
            len(parts_lists),
        )

        # Create Step candidates for all possible pairings
        for step_num in steps:
            # Create candidates for this StepNumber paired with each PartsList
            for parts_list in parts_lists:
                self._create_step_candidate(step_num, parts_list, result)

            # Also create a candidate with no PartsList (fallback)
            self._create_step_candidate(step_num, None, result)

        log.debug(
            "[step] Created %d step candidates",
            len(result.get_candidates("step")),
        )

    def _create_step_candidate(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> None:
        """Create a Step candidate for a StepNumber paired with a PartsList (or None).

        The pairing is scored only when the PartsList's bottom edge lies at or
        above the StepNumber's top edge (with a small tolerance); otherwise both
        sub-scores stay at 0.0. Both sub-scores are kept within 0.0-1.0.

        Args:
            step_num: The StepNumber for this candidate
            parts_list: The PartsList to pair with (or None for no pairing)
            result: Classification result to add the candidate to
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        # Calculate pairing scores if there's a parts_list above the step
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list is not None
            and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
        ):
            # Calculate distance (how far apart vertically). Because of the
            # ABOVE_EPS tolerance this can be slightly negative (small overlap).
            distance = step_num.bbox.y0 - parts_list.bbox.y1

            # Calculate proximity score, scaled by the step number's height.
            # Clamp to 1.0 so a negative distance cannot push the score above
            # the documented 0.0-1.0 range.
            max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = min(1.0, max(0.0, 1.0 - (distance / max_distance)))

            # Calculate alignment score (how well left edges align), scaled by
            # the step number's width. left_diff is non-negative, so the value
            # never exceeds 1.0.
            max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)

        # Build Step. When no PartsList was supplied, an empty placeholder
        # located at the step number's bbox stands in for it.
        diagram = Diagram(bbox=diagram_bbox)
        step_parts_list = (
            parts_list
            if parts_list is not None
            else PartsList(bbox=step_num.bbox, parts=[])
        )
        constructed = Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=step_parts_list,
            diagram=diagram,
        )

        # Create score
        score = _StepScore(
            step_number=step_num,
            has_parts_list=parts_list is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Add candidate (not yet a winner)
        step_candidate = Candidate(
            bbox=constructed.bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=constructed,
            source_block=None,
            failure_reason=None,
            is_winner=False,
        )

        result.add_candidate("step", step_candidate)

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now, we create a simple heuristic-based region: it starts at the step
        number's left edge, below both the step number and the parts list (if
        any), and extends to the page's right and bottom edges.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data
        # Simple heuristic: use the step number's bbox as a starting point
        # In the future, we should look for actual drawing elements below the step

        # Start with step number position
        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        # Extend to a reasonable area (placeholder logic)
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Use the rest of the page width and height as a simple approximation
        x1 = page_bbox.x1
        y1 = page_bbox.y1

        # Create a bbox for the diagram region
        return BBox(x0=x0, y0=y0, x1=x1, y1=y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def classify(self, result: ClassificationResult) -> None:
        """Greedily select the best Step candidates.

        Uses the candidates created in evaluate() to select the best pairings.
        Ensures each StepNumber value and each PartsList is used at most once.

        Args:
            result: Classification result holding the "step" candidates; winners
                are recorded on it via mark_winner().
        """
        # Get all Step candidates
        candidate_list = result.get_candidates("step")

        # Sort candidates by score (highest first). sorted() is stable, so
        # candidates with equal keys keep the order get_candidates() returned.
        sorted_candidates = sorted(
            candidate_list,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()

        # Greedily select winners
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                continue

            assert isinstance(candidate.constructed, Step)
            step = candidate.constructed
            step_value = step.step_number.value
            # A parts list with no parts (such as the placeholder built for the
            # fallback candidate) is treated as "no parts list" here.
            parts_list = step.parts_list if len(step.parts_list.parts) > 0 else None

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used (if it has parts)
            if parts_list is not None:
                # Tracked by object identity; presumably Step keeps a reference
                # to the PartsList it was given rather than a copy - TODO confirm.
                parts_list_id = id(parts_list)
                if parts_list_id in used_parts_list_ids:
                    log.debug(
                        "[step] Skipping candidate for step %d - "
                        "PartsList already used",
                        step_value,
                    )
                    continue
                # Claim this parts_list
                used_parts_list_ids.add(parts_list_id)

            # Mark this candidate as winner
            result.mark_winner(candidate, step)
            used_step_values.add(step_value)

            log.debug(
                "[step] Marking step %d as winner (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list is not None else "no",
                candidate.score_details.pairing_score(),
            )

1	"""
2	Step classifier.
3
4	Purpose
5	-------
6	Identify complete Step structures by combining step_number, parts_list, and diagram
7	elements. A Step represents a single building instruction comprising:
8	- A StepNumber label
9	- An optional PartsList (the parts needed for this step)
10	- A Diagram (the main instruction graphic showing what to build)
11
12	We look for step_numbers and attempt to pair them with nearby parts_lists and
13	identify the appropriate diagram region for each step.
14
15	Debugging
16	---------
17	Set environment variables to aid investigation without code changes:
18
19	- LOG_LEVEL=DEBUG
20	Enables DEBUG-level logging (if not already configured by caller).
21	"""
22
23	import logging	1✔
24	from dataclasses import dataclass	1✔
25
26	from build_a_long.pdf_extract.classifier.classification_result import (	1✔
27	Candidate,
28	ClassificationResult,
29	)
30	from build_a_long.pdf_extract.classifier.label_classifier import (	1✔
31	LabelClassifier,
32	)
33	from build_a_long.pdf_extract.extractor.bbox import BBox	1✔
34	from build_a_long.pdf_extract.extractor.lego_page_elements import (	1✔
35	Diagram,
36	PartsList,
37	Step,
38	StepNumber,
39	)
40
41	log = logging.getLogger(__name__)	1✔
42
43
44	@dataclass	1✔
45	class _StepScore:	1✔
46	"""Internal score representation for step classification."""
47
48	step_number: StepNumber
49	"""The step number this step is associated with."""	1✔
50
51	has_parts_list: bool
52	"""Whether this step has an associated parts list."""	1✔
53
54	step_proximity_score: float
55	"""Score based on proximity to the PartsList above (0.0-1.0).	1✔
56	1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
57
58	step_alignment_score: float
59	"""Score based on left-edge alignment with PartsList above (0.0-1.0).	1✔
60	1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
61
62	diagram_area: float
63	"""Area of the diagram region."""	1✔
64
65	def pairing_score(self) -> float:	1✔
66	"""Calculate pairing quality score (average of proximity and alignment)."""
67	if not self.has_parts_list:	1✔
68	return 0.0	1✔
69	return (self.step_proximity_score + self.step_alignment_score) / 2.0	1✔
70
71	def sort_key(self) -> tuple[float, int]:	1✔
72	"""Return a tuple for sorting candidates.
73
74	We prefer:
75	1. Higher pairing scores (better StepNumber-PartsList match)
76	2. Lower step number values (to break ties and maintain order)
77	"""
78	return (-self.pairing_score(), self.step_number.value)	1✔
79
80
81	@dataclass(frozen=True)	1✔
82	class StepClassifier(LabelClassifier):	1✔
83	"""Classifier for complete Step structures."""
84
85	outputs = frozenset({"step"})	1✔
86	requires = frozenset({"step_number", "parts_list"})	1✔
87
88	def evaluate(self, result: ClassificationResult) -> None:	1✔
89	"""Evaluate elements and create Step candidates for all possible pairings.
90
91	Creates Step candidates for each StepNumber, scoring all possible pairings
92	with PartsLists. Candidates are stored in ClassificationResult, and the
93	best ones will be selected in classify().
94	"""
95	page_data = result.page_data	1✔
96
97	# Get winners with type safety
98	steps = result.get_winners("step_number", StepNumber)	1✔
99
100	if not steps:	1✔
101	return	1✔
102
103	# Get parts_list winners
104	parts_lists = result.get_winners("parts_list", PartsList)	1✔
105
106	log.debug(	1✔
107	"[step] page=%s steps=%d parts_lists=%d",
108	page_data.page_number,
109	len(steps),
110	len(parts_lists),
111	)
112
113	# Create Step candidates for all possible pairings
114	for step_num in steps:	1✔
115	# Create candidates for this StepNumber paired with each PartsList
116	for parts_list in parts_lists:	1✔
117	self._create_step_candidate(step_num, parts_list, result)	1✔
118
119	# Also create a candidate with no PartsList (fallback)
120	self._create_step_candidate(step_num, None, result)	1✔
121
122	log.debug(	1✔
123	"[step] Created %d step candidates",
124	len(result.get_candidates("step")),
125	)
126
127	def _create_step_candidate(	1✔
128	self,
129	step_num: StepNumber,
130	parts_list: PartsList \| None,
131	result: ClassificationResult,
132	) -> None:
133	"""Create a Step candidate for a StepNumber paired with a PartsList (or None).
134
135	Args:
136	step_num: The StepNumber for this candidate
137	parts_list: The PartsList to pair with (or None for no pairing)
138	result: Classification result to add the candidate to
139	"""
140	ABOVE_EPS = 2.0 # Small epsilon for "above" check	1✔
141	ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0 # Max horizontal offset	1✔
142	DISTANCE_THRESHOLD_MULTIPLIER = 1.0 # Max vertical distance	1✔
143
144	# Calculate pairing scores if there's a parts_list above the step
145	proximity_score = 0.0	1✔
146	alignment_score = 0.0	1✔
147
148	if (	1✔
149	parts_list is not None
150	and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
151	):
152	# Calculate distance (how far apart vertically)
153	distance = step_num.bbox.y0 - parts_list.bbox.y1	1✔
154
155	# Calculate proximity score
156	max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER	1✔
157	if max_distance > 0:	1✔
158	proximity_score = max(0.0, 1.0 - (distance / max_distance))	1✔
159
160	# Calculate alignment score (how well left edges align)
161	max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER	1✔
162	left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)	1✔
163	if max_alignment_diff > 0:	1✔
164	alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))	1✔
165
166	# Identify diagram region
167	diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)	1✔
168
169	# Build Step
170	diagram = Diagram(bbox=diagram_bbox)	1✔
171	constructed = Step(	1✔
172	bbox=self._compute_step_bbox(step_num, parts_list, diagram),
173	step_number=step_num,
174	parts_list=parts_list or PartsList(bbox=step_num.bbox, parts=[]),
175	diagram=diagram,
176	)
177
178	# Create score
179	score = _StepScore(	1✔
180	step_number=step_num,
181	has_parts_list=parts_list is not None,
182	step_proximity_score=proximity_score,
183	step_alignment_score=alignment_score,
184	diagram_area=diagram_bbox.area,
185	)
186
187	# Add candidate (not yet a winner)
188	step_candidate = Candidate(	1✔
189	bbox=constructed.bbox,
190	label="step",
191	score=score.pairing_score(),
192	score_details=score,
193	constructed=constructed,
194	source_block=None,
195	failure_reason=None,
196	is_winner=False,
197	)
198
199	result.add_candidate("step", step_candidate)	1✔
200
201	def _identify_diagram_region(	1✔
202	self,
203	step_num: StepNumber,
204	parts_list: PartsList \| None,
205	result: ClassificationResult,
206	) -> BBox:
207	"""Identify the diagram region for a step.
208
209	The diagram is typically the large area below the step number and parts list.
210	For now, we create a simple heuristic-based region.
211
212	Args:
213	step_num: The step number
214	parts_list: The associated parts list (if any)
215	result: Classification result containing page_data
216
217	Returns:
218	BBox representing the diagram region
219	"""
220	page_data = result.page_data	1✔
221	# Simple heuristic: use the step number's bbox as a starting point
222	# In the future, we should look for actual drawing elements below the step
223
224	# Start with step number position
225	x0 = step_num.bbox.x0	1✔
226	y0 = step_num.bbox.y1 # Below the step number	1✔
227
228	# If there's a parts list, the diagram should be below it
229	if parts_list:	1✔
230	y0 = max(y0, parts_list.bbox.y1)	1✔
231
232	# Extend to a reasonable area (placeholder logic)
233	# TODO: Find actual drawing elements and use their bounds
234	page_bbox = page_data.bbox	1✔
235	assert page_bbox is not None	1✔
236
237	# Use the rest of the page width and height as a simple approximation
238	x1 = page_bbox.x1	1✔
239	y1 = page_bbox.y1	1✔
240
241	# Create a bbox for the diagram region
242	return BBox(x0=x0, y0=y0, x1=x1, y1=y1)	1✔
243
244	def _compute_step_bbox(	1✔
245	self,
246	step_num: StepNumber,
247	parts_list: PartsList \| None,
248	diagram: Diagram,
249	) -> BBox:
250	"""Compute the overall bounding box for the Step.
251
252	This encompasses the step number, parts list (if any), and diagram.
253
254	Args:
255	step_num: The step number element
256	parts_list: The parts list (if any)
257	diagram: The diagram element
258
259	Returns:
260	Combined bounding box
261	"""
262	bboxes = [step_num.bbox, diagram.bbox]	1✔
263	if parts_list:	1✔
264	bboxes.append(parts_list.bbox)	1✔
265
266	return BBox.union_all(bboxes)	1✔
267
268	def classify(self, result: ClassificationResult) -> None:	1✔
269	"""Greedily select the best Step candidates.
270
271	Uses the candidates created in evaluate() to select the best pairings.
272	Ensures each StepNumber value and each PartsList is used at most once.
273	"""
274	# Get all Step candidates
275	candidate_list = result.get_candidates("step")	1✔
276
277	# Sort candidates by score (highest first)
278	sorted_candidates = sorted(	1✔
279	candidate_list,
280	key=lambda c: c.score_details.sort_key(),
281	)
282
283	# Track which StepNumber values and PartsLists have been used
284	used_step_values: set[int] = set()	1✔
285	used_parts_list_ids: set[int] = set()	1✔
286
287	# Greedily select winners
288	for candidate in sorted_candidates:	1✔
289	if candidate.constructed is None:	1✔
290	continue	×
291
292	assert isinstance(candidate.constructed, Step)	1✔
293	step = candidate.constructed	1✔
294	step_value = step.step_number.value	1✔
295	parts_list = step.parts_list if len(step.parts_list.parts) > 0 else None	1✔
296
297	# Skip if this step number value is already used
298	if step_value in used_step_values:	1✔
299	log.debug(	1✔
300	"[step] Skipping candidate for step %d - value already used",
301	step_value,
302	)
303	continue	1✔
304
305	# Skip if this parts_list is already used (if it has parts)
306	if parts_list is not None:	1✔
307	parts_list_id = id(parts_list)	1✔
308	if parts_list_id in used_parts_list_ids:	1✔
309	log.debug(	1✔
310	"[step] Skipping candidate for step %d - "
311	"PartsList already used",
312	step_value,
313	)
314	continue	1✔
315	# Claim this parts_list
316	used_parts_list_ids.add(parts_list_id)	1✔
317
318	# Mark this candidate as winner
319	result.mark_winner(candidate, step)	1✔
320	used_step_values.add(step_value)	1✔
321
322	log.debug(	1✔
323	"[step] Marking step %d as winner (parts_list=%s, pairing_score=%.2f)",
324	step_value,
325	"yes" if parts_list is not None else "no",
326	candidate.score_details.pairing_score(),
327	)

bramp / build-along / 19257583787

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous