• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19251794703

11 Nov 2025 01:25AM UTC coverage: 90.748% (+3.9%) from 86.822%
19251794703

push

github

bramp
Update golden files to reflect improved parts list classification

The parts_list_max_area_ratio filter now correctly rejects full-page
drawings (bbox: 0,0 to 552.76,496.06) that were previously incorrectly
classified as parts lists.

Updated golden files:
- 6509377_page_015_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox
- 6509377_page_180_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox

These changes reflect the correct behavior where drawings occupying
>75% of the page area are rejected as likely background elements.

4708 of 5188 relevant lines covered (90.75%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.05
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from dataclasses import dataclass
1✔
25

26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    Candidate,
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
34
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
35
    Diagram,
36
    PartsList,
37
    Step,
38
    StepNumber,
39
)
40

41
# Module-level logger named after this module; the classifier below emits its
# "[step] ..." diagnostics through it at DEBUG level.
log = logging.getLogger(__name__)
1✔
42

43

44
@dataclass
class _StepScore:
    """Scoring details recorded for a single Step candidate."""

    step_number: StepNumber
    """StepNumber element the candidate was built around."""

    has_parts_list: bool
    """True when the candidate was paired with a PartsList."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above the step number (0.0-1.0).
    1.0 means closest, 0.0 means very far; also 0.0 when there is no parts list."""

    step_alignment_score: float
    """Left-edge alignment with the PartsList above the step number (0.0-1.0).
    1.0 means perfectly aligned, 0.0 means very misaligned; also 0.0 when there
    is no parts list."""

    diagram_area: float
    """Area of the diagram region assigned to the candidate."""

    def pairing_score(self) -> float:
        """Return the mean of the proximity and alignment scores.

        A candidate without a parts list always scores 0.0.
        """
        if self.has_parts_list:
            combined = self.step_proximity_score + self.step_alignment_score
            return combined / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Return the key used to order candidates, best first.

        The pairing score is negated so that an ascending sort places the
        strongest StepNumber-PartsList match first; the step number value
        breaks ties so that lower-numbered steps come first.
        """
        descending_score = -self.pairing_score()
        return (descending_score, self.step_number.value)
79

80

81
@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    ``evaluate()`` builds one Step candidate for every winning StepNumber paired
    with every winning PartsList, plus one candidate per StepNumber without a
    PartsList. ``classify()`` then greedily marks the best candidates as winners.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def evaluate(self, result: ClassificationResult) -> None:
        """Create Step candidates for all possible StepNumber/PartsList pairings.

        Candidates are stored in ``result`` under the "step" label; none of them
        is marked as a winner here. Nothing is added when the page has no winning
        StepNumber.

        Args:
            result: Classification result holding the "step_number" and
                "parts_list" candidates, and receiving the new "step" candidates.
        """
        page_data = result.page_data

        # Only winning candidates that constructed the expected element type
        # take part in pairing (isinstance also rejects a None ``constructed``).
        steps: list[StepNumber] = [
            candidate.constructed
            for candidate in result.get_candidates("step_number")
            if candidate.is_winner and isinstance(candidate.constructed, StepNumber)
        ]
        if not steps:
            return

        parts_lists: list[PartsList] = [
            candidate.constructed
            for candidate in result.get_candidates("parts_list")
            if candidate.is_winner and isinstance(candidate.constructed, PartsList)
        ]

        log.debug(
            "[step] page=%s steps=%d parts_lists=%d",
            page_data.page_number,
            len(steps),
            len(parts_lists),
        )

        for step_num in steps:
            # One candidate per available PartsList ...
            for parts_list in parts_lists:
                self._create_step_candidate(step_num, parts_list, result)

            # ... plus a fallback candidate with no PartsList at all.
            self._create_step_candidate(step_num, None, result)

        log.debug(
            "[step] Created %d step candidates",
            len(result.get_candidates("step")),
        )

    def _create_step_candidate(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> None:
        """Create a Step candidate for a StepNumber paired with a PartsList (or None).

        The pairing is scored only when the PartsList's bottom edge lies at or
        above the StepNumber's top edge (within ``ABOVE_EPS``); otherwise both
        proximity and alignment scores stay at 0.0.

        Args:
            step_num: The StepNumber for this candidate.
            parts_list: The PartsList to pair with (or None for no pairing).
            result: Classification result to add the candidate to.
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list is not None
            and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between the parts list bottom and the step number top.
            # It is negative when the two overlap by up to ABOVE_EPS.
            distance = step_num.bbox.y0 - parts_list.bbox.y1

            # Proximity falls off linearly over one step-number height.
            max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                # Clamp on both sides: a negative distance would otherwise push
                # the score above the documented 0.0-1.0 range.
                proximity_score = min(
                    1.0, max(0.0, 1.0 - (distance / max_distance))
                )

            # Alignment falls off linearly over one step-number width of
            # left-edge offset.
            max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)
        diagram = Diagram(bbox=diagram_bbox)

        # A Step always carries a PartsList; when none was paired, an empty
        # placeholder located at the step number's bbox is used instead.
        step_parts_list = (
            parts_list
            if parts_list is not None
            else PartsList(bbox=step_num.bbox, parts=[])
        )
        constructed = Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=step_parts_list,
            diagram=diagram,
        )

        score = _StepScore(
            step_number=step_num,
            has_parts_list=parts_list is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Register the candidate; winners are chosen later in classify().
        step_candidate = Candidate(
            bbox=constructed.bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=constructed,
            source_block=None,
            failure_reason=None,
            is_winner=False,
        )

        result.add_candidate("step", step_candidate)

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        Placeholder heuristic: the region starts at the step number's left edge,
        begins below the step number (and below the parts list, if that ends
        lower), and extends to the right and bottom edges of the page.

        Args:
            step_num: The step number.
            parts_list: The associated parts list (if any).
            result: Classification result containing page_data.

        Returns:
            BBox representing the diagram region.
        """
        page_data = result.page_data

        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram must also start below it.
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Use the rest of the page width and height as a simple approximation.
        return BBox(x0=x0, y0=y0, x1=page_bbox.x1, y1=page_bbox.y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        Args:
            step_num: The step number element.
            parts_list: The parts list (if any).
            diagram: The diagram element.

        Returns:
            Union of the step number, diagram and (when given) parts list bboxes.
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def classify(self, result: ClassificationResult) -> None:
        """Greedily select the best Step candidates.

        Candidates created in evaluate() are visited in ``sort_key()`` order
        (highest pairing score first, then lowest step number value). Each
        StepNumber value and each PartsList is used at most once.

        Args:
            result: Classification result holding the "step" candidates; winners
                are recorded on it via ``mark_winner``.
        """
        sorted_candidates = sorted(
            result.get_candidates("step"),
            key=lambda c: c.score_details.sort_key(),
        )

        # NOTE(review): a pairing that scored 0.0 sorts ahead of the same step's
        # no-PartsList fallback (stable sort, creation order), so it can claim a
        # PartsList that is not above the step. Confirm whether that is intended.

        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()

        for candidate in sorted_candidates:
            if candidate.constructed is None:
                continue

            assert isinstance(candidate.constructed, Step)
            step = candidate.constructed
            step_value = step.step_number.value

            # Use the flag recorded at creation time rather than inferring the
            # placeholder from an empty ``parts`` list, so that a real PartsList
            # with no parts is still tracked and claimed only once.
            parts_list = (
                step.parts_list if candidate.score_details.has_parts_list else None
            )

            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            if parts_list is not None:
                # PartsLists are tracked by object identity.
                parts_list_id = id(parts_list)
                if parts_list_id in used_parts_list_ids:
                    log.debug(
                        "[step] Skipping candidate for step %d - "
                        "PartsList already used",
                        step_value,
                    )
                    continue
                # Claim this parts_list
                used_parts_list_ids.add(parts_list_id)

            result.mark_winner(candidate, step)
            used_step_values.add(step_value)

            log.debug(
                "[step] Marking step %d as winner (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list is not None else "no",
                candidate.score_details.pairing_score(),
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc