• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19727090583

27 Nov 2025 06:15AM UTC coverage: 89.781% (+0.8%) from 88.977%
19727090583

push

github

bramp
Multiple improvements to classifiers, specifically around documentation, removing unused fields, and improving type hinting.

26 of 26 new or added lines in 14 files covered. (100.0%)

94 existing lines in 17 files now uncovered.

7327 of 8161 relevant lines covered (89.78%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.22
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from dataclasses import dataclass
1✔
25

26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    Candidate,
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.classifier.text_extractors import (
1✔
34
    extract_step_number_value,
35
)
36
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
37
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
38
    Diagram,
39
    PartsList,
40
    Step,
41
    StepNumber,
42
)
43
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
44

45
log = logging.getLogger(__name__)
1✔
46

47

48
@dataclass
class _StepScore:
    """Scoring details for one StepNumber / PartsList pairing."""

    step_number_candidate: Candidate
    """StepNumber candidate that anchors this step."""

    parts_list_candidate: Candidate | None
    """PartsList candidate paired with the step number, or None when unpaired."""

    has_parts_list: bool
    """True when a PartsList candidate is attached to this step."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above (0.0-1.0).
    1.0 means closest, 0.0 means very far or no parts list."""

    step_alignment_score: float
    """Left-edge alignment with the PartsList above (0.0-1.0).
    1.0 means perfectly aligned, 0.0 means very misaligned or no parts list."""

    diagram_area: float
    """Area of the estimated diagram region."""

    def pairing_score(self) -> float:
        """Return the mean of proximity and alignment, or 0.0 without a parts list."""
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Build the key used to rank step candidates in ascending order.

        The key is ``(-pairing_score, step_value)``, so that:
        1. Better StepNumber-PartsList pairings sort first.
        2. Ties are broken by the lower step number value.

        When the step number value cannot be read from the first source block,
        0 is used in its place.
        """
        negated_score = -self.pairing_score()

        # Only the first source block of the step number candidate is consulted.
        blocks = self.step_number_candidate.source_blocks
        first_block = blocks[0] if blocks else None
        if isinstance(first_block, Text):
            value = extract_step_number_value(first_block.text)
            if value is not None:
                return (negated_score, value)

        return (negated_score, 0)  # Fallback if value cannot be extracted
×
98

99

100
@dataclass(frozen=True)
1✔
101
class StepClassifier(LabelClassifier):
1✔
102
    """Classifier for complete Step structures."""
103

104
    output = "step"
1✔
105
    requires = frozenset({"step_number", "parts_list"})
1✔
106

107
    def _score(self, result: ClassificationResult) -> None:
        """Generate Step candidates for every pairing on the page.

        Each step_number candidate is paired with every parts_list candidate and
        once more with no parts list as a fallback. The pairings are then
        deduplicated greedily and the survivors are added to ``result``.

        Args:
            result: Classification result that supplies the scored candidates
                and receives the selected step candidates.
        """
        page_data = result.page_data

        # Work with scored candidates, not constructed elements.
        step_candidates = result.get_scored_candidates(
            "step_number", valid_only=False, exclude_failed=True
        )
        if not step_candidates:
            return

        parts_list_candidates = result.get_scored_candidates(
            "parts_list",
            valid_only=False,
            exclude_failed=True,
        )

        log.debug(
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
            page_data.page_number,
            len(step_candidates),
            len(parts_list_candidates),
        )

        # Try every parts list for each step number; the trailing None is the
        # "no parts list" fallback and is always attempted last.
        possibilities: list[Candidate] = []
        for step_candidate in step_candidates:
            for partner in (*parts_list_candidates, None):
                paired = self._create_step_candidate(step_candidate, partner, result)
                if paired:
                    possibilities.append(paired)

        # Keep only the greedy winners and register them on the result.
        winners = self._deduplicate_candidates(possibilities)
        for winner in winners:
            result.add_candidate(winner)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(winners),
            len(possibilities),
        )
161

162
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
        """Construct the Step element described by ``candidate``.

        Args:
            candidate: A step candidate whose ``score_details`` is a _StepScore.
            result: Classification result used to build the parent candidates
                and to locate the diagram region.

        Returns:
            A Step holding the built StepNumber, a PartsList (an empty one
            placed at the step number's bbox when none was paired) and a Diagram.
        """
        details = candidate.score_details
        assert isinstance(details, _StepScore)

        # The step number element comes from the parent candidate.
        step_num = result.build(details.step_number_candidate)
        assert isinstance(step_num, StepNumber)

        # The parts list is optional; build it only when one was paired.
        parts_list = None
        if details.parts_list_candidate:
            built_parts_list = result.build(details.parts_list_candidate)
            assert isinstance(built_parts_list, PartsList)
            parts_list = built_parts_list

        # The diagram region is derived from the step number and parts list.
        parts_list_bbox = parts_list.bbox if parts_list else None
        diagram = Diagram(
            bbox=self._identify_diagram_region(step_num.bbox, parts_list_bbox, result)
        )

        # The overall bbox is computed before the empty-PartsList placeholder
        # is substituted, so the placeholder never influences it.
        overall_bbox = self._compute_step_bbox(step_num, parts_list, diagram)
        if not parts_list:
            parts_list = PartsList(bbox=step_num.bbox, parts=[])

        return Step(
            bbox=overall_bbox,
            step_number=step_num,
            parts_list=parts_list,
            diagram=diagram,
        )
195

196
    def _create_step_candidate(
        self,
        step_candidate: Candidate,
        parts_list_candidate: Candidate | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a scored (but not yet constructed) Step candidate.

        Proximity and alignment are scored only when the parts list sits above
        the step number (within a small tolerance); otherwise both stay 0.0.

        Args:
            step_candidate: The StepNumber candidate for this step
            parts_list_candidate: The PartsList candidate to pair with (or None)
            result: Classification result

        Returns:
            The created Candidate with score but no construction. This
            implementation always returns a Candidate; ``None`` stays in the
            signature for callers that check for it.
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        step_bbox = step_candidate.bbox
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None

        # Pairing scores remain 0.0 unless a parts list lies above the step.
        # NOTE(review): a parts list that is NOT above the step still produces a
        # candidate with pairing score 0.0, the same score as the no-parts-list
        # fallback. Confirm that such pairings should be kept.
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list_bbox is not None
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between parts list bottom and step number top. It can
            # be slightly negative because ABOVE_EPS tolerates a small overlap.
            distance = step_bbox.y0 - parts_list_bbox.y1

            # Proximity falls off linearly over one step-number height. Clamp to
            # [0.0, 1.0] so a tolerated overlap cannot push the score above 1.0.
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = min(1.0, max(0.0, 1.0 - (distance / max_distance)))

            # Alignment falls off linearly over one step-number width of
            # left-edge offset; left_diff is non-negative so this never exceeds 1.0.
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Estimate diagram bbox for scoring purposes
        diagram_bbox = self._identify_diagram_region(step_bbox, parts_list_bbox, result)

        # Create score object with candidate references
        score = _StepScore(
            step_number_candidate=step_candidate,
            parts_list_candidate=parts_list_candidate,
            has_parts_list=parts_list_candidate is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # The candidate bbox covers the step number, the diagram and, when
        # present, the parts list.
        bboxes = [step_bbox, diagram_bbox]
        if parts_list_bbox:
            bboxes.append(parts_list_bbox)
        combined_bbox = BBox.union_all(bboxes)

        # No source blocks are attached; the parent candidates are referenced
        # through score_details instead.
        return Candidate(
            bbox=combined_bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            source_blocks=[],
        )
268

269
    def _identify_diagram_region(
        self,
        step_bbox: BBox,
        parts_list_bbox: BBox | None,
        result: ClassificationResult,
    ) -> BBox:
        """Estimate where the diagram for a step lies on the page.

        Placeholder heuristic: the region starts at the step number's left edge,
        begins below the step number (and below the parts list, when given), and
        runs to the right and bottom edges of the page.

        Args:
            step_bbox: The step number bbox
            parts_list_bbox: The associated parts list bbox (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data

        # The top edge sits below the step number and below the parts list,
        # whichever reaches further down.
        top = step_bbox.y1
        if parts_list_bbox:
            top = max(top, parts_list_bbox.y1)

        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Everything to the right of and below the anchor is treated as diagram.
        return BBox(x0=step_bbox.x0, y0=top, x1=page_bbox.x1, y1=page_bbox.y1)
1✔
311

312
    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Return the union of the bboxes that make up a Step.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Bounding box covering the step number, the diagram and, when given,
            the parts list.
        """
        if parts_list:
            return BBox.union_all([step_num.bbox, diagram.bbox, parts_list.bbox])
        return BBox.union_all([step_num.bbox, diagram.bbox])
1✔
335

336
    def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:
        """Pick the winning Step candidates greedily, best pairing first.

        Candidates are ranked by their score's ``sort_key()``. Walking that
        ranking, a candidate is kept only if its step number value has not been
        taken yet and its parts list (when it contains parts) has not been
        claimed by an earlier winner.

        Args:
            candidates: All possible Step candidates

        Returns:
            Deduplicated list of Step candidates
        """
        # Rank ascending by sort key (the key negates the pairing score).
        ranked = list(candidates)
        ranked.sort(key=lambda c: c.score_details.sort_key())

        taken_step_values: set[int] = set()
        claimed_parts_list_ids: set[int] = set()
        selected: list[Candidate] = []

        for candidate in ranked:
            # Candidates are not constructed yet, so rely on score_details.
            details = candidate.score_details
            assert isinstance(details, _StepScore)

            # The step value is read from the first source block of the step
            # number candidate; candidates without a readable value are dropped.
            blocks = details.step_number_candidate.source_blocks
            if not blocks:
                continue
            first_block = blocks[0]
            if not isinstance(first_block, Text):
                continue
            step_value = extract_step_number_value(first_block.text)
            if step_value is None:
                continue

            partner = details.parts_list_candidate

            # Each step number value may win only once.
            if step_value in taken_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # A parts list is exclusive only when it actually holds parts,
            # judged by the part_candidates on its score details.
            if partner is not None:
                partner_details = partner.score_details
                has_parts = (
                    hasattr(partner_details, "part_candidates")
                    and len(partner_details.part_candidates) > 0
                )
                if has_parts:
                    partner_id = id(partner)
                    if partner_id in claimed_parts_list_ids:
                        log.debug(
                            "[step] Skipping candidate for step %d - "
                            "PartsList candidate already used",
                            step_value,
                        )
                        continue
                    # Claim this parts list for the current winner.
                    claimed_parts_list_ids.add(partner_id)

            selected.append(candidate)
            taken_step_values.add(step_value)

            log.debug(
                "[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if partner is not None else "no",
                details.pairing_score(),
            )

        return selected
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc