• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19556130039

21 Nov 2025 12:48AM UTC coverage: 90.819% (-0.05%) from 90.867%
19556130039

push

github

bramp
Updated the golden fixtures.

5025 of 5533 relevant lines covered (90.82%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from dataclasses import dataclass
1✔
25

26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    Candidate,
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
34
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
35
    Diagram,
36
    PartsList,
37
    Step,
38
    StepNumber,
39
)
40

41
# Module-level logger, named after this module so callers can filter its
# DEBUG output (see the "Debugging" section of the module docstring).
log = logging.getLogger(__name__)
1✔
42

43

44
@dataclass
class _StepScore:
    """Score details recorded for a single Step candidate."""

    step_number: StepNumber
    """StepNumber element the candidate is built around."""

    has_parts_list: bool
    """True when the candidate was paired with a PartsList."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above the step number (0.0-1.0).
    1.0 means closest, 0.0 means very far; also 0.0 when no parts list."""

    step_alignment_score: float
    """Left-edge alignment with the PartsList above the step number (0.0-1.0).
    1.0 means perfectly aligned, 0.0 very misaligned; 0.0 when no parts list."""

    diagram_area: float
    """Area of the diagram region assigned to the step."""

    def pairing_score(self) -> float:
        """Return the mean of proximity and alignment, or 0.0 if unpaired."""
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Build the ordering key used when ranking candidates.

        Sorting ascending by this key yields:
        1. Candidates with a higher pairing score first (the score is negated)
        2. Lower step number values first among equal pairing scores
        """
        quality = self.pairing_score()
        step_value = self.step_number.value
        return (-quality, step_value)
1✔
79

80

81
@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    For every StepNumber winner, one candidate is built per PartsList winner
    plus one fallback candidate without a PartsList. A greedy pass then keeps
    at most one candidate per step number value and per PartsList.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def evaluate(self, result: ClassificationResult) -> None:
        """Evaluate elements and create deduplicated Step candidates.

        Creates Step candidates for each StepNumber, scoring all possible pairings
        with PartsLists, then greedily selects the best pairings to ensure each
        StepNumber value and PartsList is used at most once. Only the selected
        candidates are added to ``result`` under the "step" label.

        Args:
            result: Classification result providing the page data and the
                "step_number" / "parts_list" winners; receives the new
                "step" candidates.
        """
        page_data = result.page_data

        # Get step numbers and parts lists using score-based selection
        steps = result.get_winners_by_score("step_number", StepNumber)

        if not steps:
            return

        # Get parts_list candidates by score
        parts_lists = result.get_winners_by_score("parts_list", PartsList)

        log.debug(
            "[step] page=%s steps=%d parts_lists=%d",
            page_data.page_number,
            len(steps),
            len(parts_lists),
        )

        # Create all possible Step candidates for pairings. The per-step order
        # (every PartsList first, then the no-PartsList fallback) matters: the
        # later sort is stable, so it decides which of two equal-scoring
        # candidates is considered first.
        all_candidates: list[Candidate] = []
        for step_num in steps:
            for parts_list in parts_lists:
                candidate = self._create_step_candidate(step_num, parts_list, result)
                if candidate is not None:
                    all_candidates.append(candidate)

            # Also create a candidate with no PartsList (fallback)
            candidate = self._create_step_candidate(step_num, None, result)
            if candidate is not None:
                all_candidates.append(candidate)

        # Greedily select the best candidates (deduplication)
        deduplicated_candidates = self._deduplicate_candidates(all_candidates)

        # Add the deduplicated candidates to the result
        for candidate in deduplicated_candidates:
            result.add_candidate("step", candidate)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(deduplicated_candidates),
            len(all_candidates),
        )

    @staticmethod
    def _ratio_score(offset: float, limit: float) -> float:
        """Convert an offset into a linear falloff score within [0.0, 1.0].

        Args:
            offset: Measured distance/difference (may be slightly negative).
            limit: Offset at which the score reaches 0.0.

        Returns:
            1.0 for an offset of zero or less, decreasing linearly to 0.0 at
            ``limit`` and beyond. 0.0 when ``limit`` is not positive.
        """
        if limit <= 0:
            return 0.0
        return min(1.0, max(0.0, 1.0 - (offset / limit)))

    def _create_step_candidate(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a Step candidate for a StepNumber paired with a PartsList (or None).

        The candidate is only built and returned here; adding it to ``result``
        is left to the caller.

        Args:
            step_num: The StepNumber for this candidate
            parts_list: The PartsList to pair with (or None for no pairing)
            result: Classification result whose page data bounds the diagram
                region

        Returns:
            The created Candidate. The current implementation always returns a
            Candidate; the optional return type is kept for callers that check
            for None.
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        # Pairing scores stay at 0.0 unless the parts_list sits above the step
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list is not None
            and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between the parts list bottom and the step number
            # top. The ABOVE_EPS tolerance lets this be slightly negative, so
            # the score is clamped to keep it within the documented 0.0-1.0.
            distance = step_num.bbox.y0 - parts_list.bbox.y1
            max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            proximity_score = self._ratio_score(distance, max_distance)

            # Alignment score (how well left edges align)
            max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)
            alignment_score = self._ratio_score(left_diff, max_alignment_diff)

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)

        # Build Step. An unpaired step gets an empty placeholder PartsList
        # located at the step number; _deduplicate_candidates() recognises
        # unpaired steps by that empty ``parts`` list.
        diagram = Diagram(bbox=diagram_bbox)
        if parts_list is not None:
            step_parts_list = parts_list
        else:
            step_parts_list = PartsList(bbox=step_num.bbox, parts=[])
        constructed = Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=step_parts_list,
            diagram=diagram,
        )

        # Create score
        score = _StepScore(
            step_number=step_num,
            has_parts_list=parts_list is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Create candidate
        return Candidate(
            bbox=constructed.bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=constructed,
            source_block=None,
            failure_reason=None,
        )

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now, we create a simple heuristic-based region: it starts at the
        step number's left edge, below both the step number and the parts list
        (if any), and extends to the page's right and bottom edges.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data
        # Simple heuristic: use the step number's bbox as a starting point
        # In the future, we should look for actual drawing elements below the step

        # Start with step number position
        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        # Extend to a reasonable area (placeholder logic)
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        # Invariant (also narrows the optional type): page bounds must be known.
        assert page_bbox is not None

        # Use the rest of the page width and height as a simple approximation
        return BBox(x0=x0, y0=y0, x1=page_bbox.x1, y1=page_bbox.y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:
        """Greedily select the best Step candidates.

        Ensures each StepNumber value and each PartsList is used at most once.

        Args:
            candidates: All possible Step candidates

        Returns:
            Deduplicated list of Step candidates, in selection order
        """
        # Sort candidates by score (highest first), ties by step number value.
        # NOTE(review): a pairing whose PartsList is not above the step scores
        # 0.0, the same as the no-PartsList fallback. sorted() is stable and
        # evaluate() generates pairings before the fallback, so such a pairing
        # is tried first and may claim the PartsList - confirm this is intended.
        sorted_candidates = sorted(
            candidates,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()
        selected: list[Candidate] = []

        # Greedily select winners
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                continue

            assert isinstance(candidate.constructed, Step)
            step = candidate.constructed
            step_value = step.step_number.value
            # A PartsList without parts (e.g. the placeholder of an unpaired
            # step) is treated as "no parts list" and is never claimed.
            parts_list = step.parts_list if len(step.parts_list.parts) > 0 else None

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used (if it has parts)
            if parts_list is not None:
                # Identity, not equality: the same PartsList object is shared
                # by every candidate that pairs with it.
                parts_list_id = id(parts_list)
                if parts_list_id in used_parts_list_ids:
                    log.debug(
                        "[step] Skipping candidate for step %d - "
                        "PartsList already used",
                        step_value,
                    )
                    continue
                # Claim this parts_list
                used_parts_list_ids.add(parts_list_id)

            # Select this candidate
            selected.append(candidate)
            used_step_values.add(step_value)

            log.debug(
                "[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list is not None else "no",
                candidate.score_details.pairing_score(),
            )

        return selected

    def classify(self, result: ClassificationResult) -> None:
        """No-op - deduplication is done in evaluate().

        This is part of a refactoring to eliminate the is_winner flag and
        mark_winner() method. Selection logic now happens in evaluate() where
        only the final deduplicated candidates are created.
        """
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc