• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19995046189

06 Dec 2025 10:18PM UTC coverage: 90.506% (+0.09%) from 90.421%
19995046189

push

github

bramp
test: regenerate golden files for step classifier refactoring

10525 of 11629 relevant lines covered (90.51%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.4
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24

25
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    CandidateFailedError,
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.classifier.parts.parts_list_classifier import (
1✔
34
    _PartsListScore,
35
)
36
from build_a_long.pdf_extract.classifier.score import Score, Weight
1✔
37
from build_a_long.pdf_extract.classifier.text import (
1✔
38
    extract_step_number_value,
39
)
40
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
41
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
42
    Arrow,
43
    Diagram,
44
    PartsList,
45
    RotationSymbol,
46
    Step,
47
    StepNumber,
48
    SubAssembly,
49
)
50
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
51

52
log = logging.getLogger(__name__)
1✔
53

54

55
class _StepScore(Score):
    """Score details for one StepNumber / PartsList pairing.

    Carries the paired candidates together with the geometric sub-scores that
    rank alternative pairings for the same step number.
    """

    step_number_candidate: Candidate
    """The step number candidate this step is associated with."""

    parts_list_candidate: Candidate | None
    """The parts list candidate paired with this step (if any)."""

    has_parts_list: bool
    """Whether this step has an associated parts list."""

    step_proximity_score: float
    """Score based on proximity to the PartsList above (0.0-1.0).
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""

    step_alignment_score: float
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""

    def score(self) -> Weight:
        """Return the pairing score; a thin alias for ``overall_score``."""
        return self.overall_score()

    def overall_score(self) -> float:
        """Compute the quality of this pairing.

        A step without a parts list gets a flat 0.3. A step with a parts list
        starts from 0.5 and earns up to another 0.5 from the mean of the
        proximity and alignment sub-scores, so paired steps always outrank
        unpaired ones. Diagrams play no part here: they are located at build
        time so rotation symbols can claim small images first.
        """
        if not self.has_parts_list:
            # Flat, lower score for a step that has no parts list.
            return 0.3

        mean_pairing = (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.5 + 0.5 * mean_pairing

    def sort_key(self) -> tuple[float, int]:
        """Return a key that orders candidates best-first.

        Ordering is by:
        1. Higher overall score (negated so ascending sort puts it first)
        2. Lower step number value, as a tie-breaker that keeps page order

        The step number value is read from the first source block of the step
        number candidate; 0 is used when that block is missing, is not a Text
        block, or does not yield a value.
        """
        blocks = self.step_number_candidate.source_blocks

        # Only the first source block is consulted for the step number.
        step_value = None
        if blocks and isinstance(blocks[0], Text):
            step_value = extract_step_number_value(blocks[0].text)

        return (-self.overall_score(), 0 if step_value is None else step_value)
115

116

117
class StepClassifier(LabelClassifier):
1✔
118
    """Classifier for complete Step structures."""
119

120
    output = "step"
1✔
121
    requires = frozenset(
1✔
122
        {"step_number", "parts_list", "diagram", "rotation_symbol", "subassembly"}
123
    )
124

125
    def _score(self, result: ClassificationResult) -> None:
        """Enumerate step pairings, deduplicate them, and register the winners.

        Every step_number candidate is paired with every parts_list candidate,
        plus one pairing with no parts list as a fallback. The pairings are
        reduced by ``_deduplicate_and_assign_diagrams`` and the survivors are
        added to ``result``. Nothing is added when there are no step_number
        candidates.

        Args:
            result: Classification result that supplies the scored candidates
                and receives the selected step candidates.
        """
        page_data = result.page_data

        # Work with scored candidates here, not constructed elements.
        step_numbers = result.get_scored_candidates(
            "step_number", valid_only=False, exclude_failed=True
        )
        if not step_numbers:
            return

        parts_lists = result.get_scored_candidates(
            "parts_list",
            valid_only=False,
            exclude_failed=True,
        )

        log.debug(
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
            page_data.page_number,
            len(step_numbers),
            len(parts_lists),
        )

        # Build every possible pairing; the trailing None is the
        # "no parts list" fallback for each step number.
        pairings: list[Candidate] = []
        for step_number in step_numbers:
            for partner in (*parts_lists, None):
                paired = self._create_step_candidate(step_number, partner, result)
                if paired:
                    pairings.append(paired)

        # Reduce to one candidate per step number value.
        winners = self._deduplicate_and_assign_diagrams(pairings, result)
        for winner in winners:
            result.add_candidate(winner)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(winners),
            len(pairings),
        )
182

183
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
        """Assemble a Step element from one scored step candidate.

        The sub-elements are built in a fixed order: step number, parts list,
        rotation symbol, diagram, arrows, subassemblies. The rotation symbol
        is built before the diagram so that it can claim small images that
        would otherwise be clustered into the diagram.

        Args:
            candidate: Step candidate whose ``score_details`` is a _StepScore.
            result: Classification result used to build the sub-elements.

        Returns:
            The constructed Step, with its bbox clipped to the page bounds.
        """
        details = candidate.score_details
        assert isinstance(details, _StepScore)

        # Step number comes from the parent candidate referenced by the score.
        step_num = result.build(details.step_number_candidate)
        assert isinstance(step_num, StepNumber)

        # The parts list is optional.
        parts_list = None
        if details.parts_list_candidate:
            parts_list = result.build(details.parts_list_candidate)
            assert isinstance(parts_list, PartsList)

        # No diagram exists yet, so the rotation symbol search is anchored on
        # the step number's bbox (None is passed for the diagram).
        rotation_symbol = self._get_rotation_symbol_for_step(step_num, None, result)

        # Rotation symbols have claimed their images; now pick the diagram.
        diagram = self._find_and_build_diagram_for_step(step_num, parts_list, result)

        # Arrows and subassemblies are searched for around the diagram.
        arrows = self._get_arrows_for_step(step_num, diagram, result)
        subassemblies = self._get_subassemblies_for_step(step_num, diagram, result)

        # Overall bbox, clipped to the page.
        step_bbox = self._compute_step_bbox(
            step_num, parts_list, diagram, result.page_data.bbox
        )

        return Step(
            bbox=step_bbox,
            step_number=step_num,
            parts_list=parts_list,
            diagram=diagram,
            rotation_symbol=rotation_symbol,
            arrows=arrows,
            subassemblies=subassemblies,
        )
230

231
    def _find_and_build_diagram_for_step(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> Diagram | None:
        """Pick, then build, the diagram that best matches this step.

        Runs at build time, after the rotation symbol has been built, so any
        small images it needs are already claimed and will not be clustered
        into the diagram.

        Args:
            step_num: The built step number element
            parts_list: The built parts list element (if any); not consulted
                by the current selection logic
            result: Classification result containing diagram candidates

        Returns:
            The built Diagram, or None when no unbuilt candidate exists, the
            best candidate scores below 0.2, or building it fails.
        """
        # Only non-failed candidates that nobody has constructed yet.
        unbuilt = [
            cand
            for cand in result.get_scored_candidates(
                "diagram", valid_only=False, exclude_failed=True
            )
            if cand.constructed is None
        ]

        if not unbuilt:
            log.debug(
                "[step] No diagram candidates available for step %d",
                step_num.value,
            )
            return None

        # Rank candidates by position relative to the step number; the first
        # candidate with the highest score wins.
        anchor = step_num.bbox
        winner = None
        winner_score = -float("inf")

        for cand in unbuilt:
            pair_score = self._score_step_diagram_pair(anchor, cand.bbox)

            log.debug(
                "[step] Diagram candidate at %s for step %d: score=%.2f",
                cand.bbox,
                step_num.value,
                pair_score,
            )

            if pair_score > winner_score:
                winner, winner_score = cand, pair_score

        if winner is None or winner_score < 0.2:
            log.debug(
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
                step_num.value,
                winner_score,
            )
            return None

        # Building can fail; a failure means this step has no diagram.
        try:
            built = result.build(winner)
            assert isinstance(built, Diagram)
            log.debug(
                "[step] Built diagram at %s for step %d (score=%.2f)",
                built.bbox,
                step_num.value,
                winner_score,
            )
            return built
        except CandidateFailedError as e:
            log.debug(
                "[step] Failed to build diagram for step %d: %s",
                step_num.value,
                e,
            )
            return None
311

312
    def _create_step_candidate(
        self,
        step_candidate: Candidate,
        parts_list_candidate: Candidate | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a Step candidate (without diagram assignment).

        Diagrams are found at build time, not during scoring, to allow
        rotation symbols to claim small images first.

        Proximity and alignment are only scored when the parts list sits above
        the step number (its bottom edge is at most ABOVE_EPS below the step
        number's top edge). Otherwise both sub-scores stay at 0.0. Both
        sub-scores are kept within 0.0-1.0.

        Args:
            step_candidate: The StepNumber candidate for this step
            parts_list_candidate: The PartsList candidate to pair with (or None)
            result: Classification result (not consulted here)

        Returns:
            The created Candidate with score but no construction
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        step_bbox = step_candidate.bbox
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None

        # Calculate pairing scores if there's a parts_list above the step
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list_bbox is not None
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between the parts list and the step number. The
            # "above" test tolerates an overlap of up to ABOVE_EPS, which would
            # make the raw gap negative and push the proximity score past 1.0;
            # treat any such overlap as a gap of zero.
            distance = max(0.0, step_bbox.y0 - parts_list_bbox.y1)

            # Proximity: 1.0 when touching, falling to 0.0 at one step-number
            # height away.
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = max(0.0, 1.0 - (distance / max_distance))

            # Alignment: 1.0 when left edges coincide, falling to 0.0 at one
            # step-number width of horizontal offset.
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Create score object with candidate references
        # Diagrams are found at build time, not during scoring
        score = _StepScore(
            step_number_candidate=step_candidate,
            parts_list_candidate=parts_list_candidate,
            has_parts_list=parts_list_candidate is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
        )

        # Calculate combined bbox for the candidate (without diagram)
        combined_bbox = step_bbox
        if parts_list_bbox is not None:
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)

        # Create candidate
        return Candidate(
            bbox=combined_bbox,
            label="step",
            score=score.overall_score(),
            score_details=score,
            source_blocks=[],
        )
383

384
    def _score_step_diagram_pair(
1✔
385
        self,
386
        step_bbox: BBox,
387
        diagram_bbox: BBox,
388
    ) -> float:
389
        """Score how well a diagram matches a step.
390

391
        Diagrams are typically positioned to the right of and/or below the step
392
        number. This method scores based on:
393
        - Horizontal position: prefer diagrams to the right, penalize left
394
        - Vertical position: prefer diagrams below the step header
395
        - Distance: closer is better
396

397
        Args:
398
            step_bbox: The step number bounding box
399
            diagram_bbox: The diagram bounding box to score
400

401
        Returns:
402
            Score between 0.0 and 1.0 (higher is better match)
403
        """
404
        # Reference point: bottom-right of step number
405
        ref_x = step_bbox.x1
1✔
406
        ref_y = step_bbox.y1
1✔
407

408
        # TODO Move all these constants into config, or make them adaptive?
409

410
        # Horizontal score
411
        # Diagrams to the right are preferred, but allow some overlap
412
        x_offset = diagram_bbox.x0 - ref_x
1✔
413

414
        if x_offset >= -50:
1✔
415
            # Diagram starts to the right or slightly overlapping - good
416
            # Score decreases slightly with distance to the right
417
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
1✔
418
        elif x_offset >= -200:
1✔
419
            # Diagram is moderately to the left - acceptable
420
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
1✔
421
        else:
422
            # Diagram is far to the left - poor match
423
            x_score = max(0.1, 0.3 + x_offset / 400.0)
1✔
424

425
        # Vertical score
426
        # Diagrams below the step header are preferred
427
        y_offset = diagram_bbox.y0 - ref_y
1✔
428

429
        if y_offset >= -30:
1✔
430
            # Diagram starts below or slightly overlapping - good
431
            # Score decreases with vertical distance
432
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
1✔
433
        elif y_offset >= -100:
1✔
434
            # Diagram is moderately above - less good but acceptable
435
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
1✔
436
        else:
437
            # Diagram is far above - poor match
438
            y_score = max(0.05, 0.2 + y_offset / 300.0)
1✔
439

440
        # Combined score - weight both dimensions equally
441
        score = 0.5 * x_score + 0.5 * y_score
1✔
442

443
        return score
1✔
444

445
    def _get_rotation_symbol_for_step(
        self,
        step_num: StepNumber,
        diagram_or_candidate: Diagram | Candidate | None,
        result: ClassificationResult,
    ) -> RotationSymbol | None:
        """Find rotation symbol associated with this step.

        Looks for rotation symbol candidates whose bbox overlaps a region
        around the step's diagram (or around the step number when no diagram
        is given), expanded by 50 units on every side. The highest-scored
        overlapping candidate is built and returned. Candidates whose score is
        not above 0.0 are never selected.

        A rotation symbol is optional, so a candidate that fails to build is
        logged and treated as "no rotation symbol" rather than aborting the
        step, matching how diagram build failures are handled.

        Args:
            step_num: The step number element
            diagram_or_candidate: The diagram element or candidate (if any).
                Can be either a built Diagram or an unbuilt Candidate with a bbox.
            result: Classification result containing rotation symbol candidates

        Returns:
            Single RotationSymbol element for this step, or None if none was
            found or the best candidate could not be built
        """
        rotation_symbol_candidates = result.get_scored_candidates(
            "rotation_symbol", valid_only=False, exclude_failed=True
        )

        log.debug(
            "[step] Looking for rotation symbols for step %d, found %d candidates",
            step_num.value,
            len(rotation_symbol_candidates),
        )

        if not rotation_symbol_candidates:
            return None

        # Determine search region: prefer diagram area, fallback to step area
        # Accept both Diagram elements and Candidate objects (both have .bbox)
        search_bbox = (
            diagram_or_candidate.bbox
            if diagram_or_candidate is not None
            else step_num.bbox
        )

        # Expand search region to catch nearby symbols
        search_region = BBox(
            x0=search_bbox.x0 - 50,
            y0=search_bbox.y0 - 50,
            x1=search_bbox.x1 + 50,
            y1=search_bbox.y1 + 50,
        )

        log.debug(
            "[step] Search region for step %d: %s",
            step_num.value,
            search_region,
        )

        # Find rotation symbols within or overlapping the search region
        # Keep track of best candidate by score
        best_candidate = None
        best_score = 0.0
        for candidate in rotation_symbol_candidates:
            overlaps = candidate.bbox.overlaps(search_region)
            log.debug(
                "[step]   Candidate at %s, overlaps=%s, score=%.2f",
                candidate.bbox,
                overlaps,
                candidate.score,
            )
            if overlaps and candidate.score > best_score:
                best_candidate = candidate
                best_score = candidate.score

        if best_candidate is None:
            log.debug("[step] No rotation symbol found for step %d", step_num.value)
            return None

        try:
            rotation_symbol = result.build(best_candidate)
        except CandidateFailedError as e:
            # Do not let an optional element take the whole step down.
            log.debug(
                "[step] Failed to build rotation symbol for step %d: %s",
                step_num.value,
                e,
            )
            return None

        assert isinstance(rotation_symbol, RotationSymbol)
        log.debug(
            "[step] Found rotation symbol for step %d (score=%.2f)",
            step_num.value,
            best_score,
        )
        return rotation_symbol
528

529
    def _get_arrows_for_step(
        self,
        step_num: StepNumber,
        diagram: Diagram | None,
        result: ClassificationResult,
    ) -> list[Arrow]:
        """Collect the arrows that belong to this step.

        Every arrow candidate whose bbox overlaps the diagram's bbox (or the
        step number's bbox when there is no diagram), grown by 100 units on
        each side, is built and returned. Typically these are arrows pointing
        from subassembly callout boxes to the main diagram.

        NOTE(review): "arrow" is not listed in this class's ``requires`` set;
        confirm arrow candidates are always scored before this runs.

        Args:
            step_num: The step number element
            diagram: The diagram element (if any)
            result: Classification result containing arrow candidates

        Returns:
            List of Arrow elements for this step (possibly empty)
        """
        candidates = result.get_scored_candidates(
            "arrow", valid_only=False, exclude_failed=True
        )

        log.debug(
            "[step] Looking for arrows for step %d, found %d candidates",
            step_num.value,
            len(candidates),
        )

        if not candidates:
            return []

        # Anchor on the diagram when there is one, else on the step number.
        anchor = diagram.bbox if diagram else step_num.bbox

        # Arrows can reach further than rotation symbols, hence the wider
        # 100-unit margin.
        margin = 100
        search_region = BBox(
            x0=anchor.x0 - margin,
            y0=anchor.y0 - margin,
            x1=anchor.x1 + margin,
            y1=anchor.y1 + margin,
        )

        log.debug(
            "[step] Arrow search region for step %d: %s",
            step_num.value,
            search_region,
        )

        found: list[Arrow] = []
        for cand in candidates:
            in_region = cand.bbox.overlaps(search_region)
            log.debug(
                "[step]   Arrow candidate at %s, overlaps=%s, score=%.2f",
                cand.bbox,
                in_region,
                cand.score,
            )
            if not in_region:
                continue

            try:
                built = result.build(cand)
                assert isinstance(built, Arrow)
                found.append(built)
            except CandidateFailedError:
                # Expected when overlapping arrows share source blocks and
                # this one lost the conflict: skip it.
                log.debug(
                    "[step]   Arrow candidate at %s failed (conflict), skipping",
                    cand.bbox,
                )

        log.debug(
            "[step] Found %d arrows for step %d",
            len(found),
            step_num.value,
        )
        return found
610

611
    def _get_subassemblies_for_step(
        self,
        step_num: StepNumber,
        diagram: Diagram | None,
        result: ClassificationResult,
    ) -> list[SubAssembly]:
        """Find subassemblies associated with this step.

        Looks for subassembly candidates whose bbox overlaps the diagram's
        bbox (or the step number's bbox when there is no diagram), expanded by
        150 units on every side. SubAssemblies are callout boxes showing
        sub-assemblies.

        Building is best-effort: a candidate that cannot be built is skipped.
        An expected CandidateFailedError is logged at DEBUG; any other
        exception is logged at WARNING with a traceback so genuine defects are
        not hidden in debug output.

        Args:
            step_num: The step number element
            diagram: The diagram element (if any)
            result: Classification result containing subassembly candidates

        Returns:
            List of SubAssembly elements for this step
        """
        subassembly_candidates = result.get_scored_candidates(
            "subassembly", valid_only=False, exclude_failed=True
        )

        log.debug(
            "[step] Looking for subassemblies for step %d, found %d candidates",
            step_num.value,
            len(subassembly_candidates),
        )

        if not subassembly_candidates:
            return []

        # Determine search region: prefer diagram area, fallback to step area
        search_bbox = diagram.bbox if diagram else step_num.bbox

        # Expand search region to catch subassemblies near the diagram
        # Use a larger margin since subassemblies can be positioned further from
        # the main diagram
        search_region = BBox(
            x0=search_bbox.x0 - 150,
            y0=search_bbox.y0 - 150,
            x1=search_bbox.x1 + 150,
            y1=search_bbox.y1 + 150,
        )

        log.debug(
            "[step] SubAssembly search region for step %d: %s",
            step_num.value,
            search_region,
        )

        # Find subassemblies within or overlapping the search region
        subassemblies: list[SubAssembly] = []

        for candidate in subassembly_candidates:
            overlaps = candidate.bbox.overlaps(search_region)
            log.debug(
                "[step]   SubAssembly candidate at %s, overlaps=%s, score=%.2f",
                candidate.bbox,
                overlaps,
                candidate.score,
            )
            if not overlaps:
                continue

            try:
                subassembly = result.build(candidate)
                assert isinstance(subassembly, SubAssembly)
                subassemblies.append(subassembly)
            except CandidateFailedError as e:
                # Expected outcome when the candidate cannot be built.
                log.debug(
                    "[step]   Failed to build subassembly at %s: %s",
                    candidate.bbox,
                    e,
                )
            except Exception:
                # Still non-fatal, but anything other than a candidate failure
                # points at a real problem, so surface it with a traceback.
                log.warning(
                    "[step]   Unexpected error building subassembly at %s",
                    candidate.bbox,
                    exc_info=True,
                )

        log.debug(
            "[step] Found %d subassemblies for step %d",
            len(subassemblies),
            step_num.value,
        )
        return subassemblies
692

693
    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram | None,
        page_bbox: BBox,
    ) -> BBox:
        """Return the bbox enclosing the step's main elements, clipped to the page.

        The union covers the step number plus the parts list and diagram when
        present. Clipping handles elements that extend slightly off-page
        (e.g., arrows in diagrams).

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element (if any)
            page_bbox: The page bounding box to clip to

        Returns:
            Combined bounding box, clipped to page bounds
        """
        member_bboxes = [step_num.bbox]
        # Optional elements contribute only when present.
        member_bboxes.extend(elem.bbox for elem in (parts_list, diagram) if elem)

        enclosing = BBox.union_all(member_bboxes)
        return enclosing.clip_to(page_bbox)
722

723
    def _deduplicate_and_assign_diagrams(
        self, candidates: list[Candidate], result: ClassificationResult
    ) -> list[Candidate]:
        """Select the best Step candidates, ensuring each step number is unique.

        Works in three passes:

        1. Keep the highest-scoring candidate for each step number value.
           Candidates whose step number value cannot be read from the first
           source block of their step number candidate are dropped.
        2. Resolve parts-list conflicts best-first: walking the survivors by
           descending score (ties broken by lower step number), the first step
           to reference a non-empty parts list owns it. Parts lists with no
           part candidates may be shared freely.
        3. Emit the survivors in their original order. A step that lost its
           parts list is re-created as a parts-list-free candidate: its
           pairing sub-scores are reset to 0.0 and its bbox shrinks to the
           step number's bbox so it no longer covers the parts list.

        A step that loses its parts list falls back to having none; it is not
        re-paired with its next-best parts list.

        Diagrams are found at build time, not during scoring, to allow
        rotation symbols to claim small images first.

        Args:
            candidates: All possible Step candidates
            result: Classification result (unused, kept for API compatibility)

        Returns:
            Deduplicated list of Step candidates (one per step number value)
        """
        # Pass 1: best candidate per step number value.
        best_by_step_value: dict[int, Candidate] = {}

        for candidate in candidates:
            details = candidate.score_details
            assert isinstance(details, _StepScore)

            # The step number value is read from the first source block only.
            blocks = details.step_number_candidate.source_blocks
            if not blocks or not isinstance(blocks[0], Text):
                continue

            step_value = extract_step_number_value(blocks[0].text)
            if step_value is None:
                continue

            incumbent = best_by_step_value.get(step_value)
            if incumbent is None or candidate.score > incumbent.score:
                # Re-assigning an existing key keeps its original position.
                best_by_step_value[step_value] = candidate

        if not best_by_step_value:
            return []

        # Pass 2: decide parts-list ownership, strongest pairing first, so a
        # weaker pairing cannot take a parts list just by appearing earlier.
        claimed_parts_list_ids: set[int] = set()
        demoted_step_values: set[int] = set()

        ranked = sorted(
            best_by_step_value.items(),
            key=lambda item: (-item[1].score, item[0]),
        )
        for step_value, candidate in ranked:
            details = candidate.score_details
            assert isinstance(details, _StepScore)

            parts_list_candidate = details.parts_list_candidate
            if parts_list_candidate is None:
                continue

            # Only parts lists that actually contain parts must be unique.
            parts_details = parts_list_candidate.score_details
            has_parts = (
                isinstance(parts_details, _PartsListScore)
                and len(parts_details.part_candidates) > 0
            )
            if not has_parts:
                continue

            parts_list_id = id(parts_list_candidate)
            if parts_list_id in claimed_parts_list_ids:
                demoted_step_values.add(step_value)
            else:
                claimed_parts_list_ids.add(parts_list_id)

        # Pass 3: emit in the original order, rewriting demoted steps.
        selected: list[Candidate] = []

        for step_value, candidate in best_by_step_value.items():
            details = candidate.score_details
            assert isinstance(details, _StepScore)

            if step_value in demoted_step_values:
                step_number_candidate = details.step_number_candidate
                updated_score = _StepScore(
                    step_number_candidate=step_number_candidate,
                    parts_list_candidate=None,
                    has_parts_list=False,
                    # No parts list means no pairing geometry to score.
                    step_proximity_score=0.0,
                    step_alignment_score=0.0,
                )
                candidate = Candidate(
                    # The old bbox was the union with the dropped parts list.
                    bbox=step_number_candidate.bbox,
                    label=candidate.label,
                    score=updated_score.overall_score(),
                    score_details=updated_score,
                    source_blocks=candidate.source_blocks,
                )
                details = updated_score

            selected.append(candidate)

            log.debug(
                "[step] Selected step %d (parts_list=%s, score=%.2f)",
                step_value,
                "yes" if details.parts_list_candidate is not None else "no",
                candidate.score,
            )

        return selected
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc