• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20158146015

12 Dec 2025 06:14AM UTC coverage: 90.787% (+0.3%) from 90.479%
20158146015

push

github

bramp
Ignore profile output.

12594 of 13872 relevant lines covered (90.79%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

77.73
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24

25
import numpy as np
1✔
26
from scipy.optimize import linear_sum_assignment
1✔
27

28
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
29
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
30
    CandidateFailedError,
31
    ClassificationResult,
32
)
33
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
34
    LabelClassifier,
35
)
36
from build_a_long.pdf_extract.classifier.score import (
1✔
37
    Score,
38
    Weight,
39
    find_best_scoring,
40
)
41
from build_a_long.pdf_extract.classifier.text import (
1✔
42
    extract_step_number_value,
43
)
44
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
45
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
46
    Arrow,
47
    Diagram,
48
    Divider,
49
    LegoPageElements,
50
    PartsList,
51
    RotationSymbol,
52
    Step,
53
    StepNumber,
54
    SubAssembly,
55
)
56
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
57

58
log = logging.getLogger(__name__)
1✔
59

60

61
class _StepScore(Score):
1✔
62
    """Internal score representation for step classification."""
63

64
    step_value: int
65
    """The parsed step number value (e.g., 1, 2, 3)."""
1✔
66

67
    step_number_candidate: Candidate
68
    """The step number candidate this step is associated with."""
1✔
69

70
    parts_list_candidate: Candidate | None
71
    """The parts list candidate paired with this step (if any)."""
1✔
72

73
    has_parts_list: bool
74
    """Whether this step has an associated parts list."""
1✔
75

76
    step_proximity_score: float
77
    """Score based on proximity to the PartsList above (0.0-1.0).
1✔
78
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
79

80
    step_alignment_score: float
81
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
1✔
82
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
83

84
    def score(self) -> Weight:
1✔
85
        """Return the overall pairing score."""
86
        return self.overall_score()
×
87

88
    def overall_score(self) -> float:
1✔
89
        """Calculate overall quality score based on parts list pairing.
90

91
        Steps with a parts_list are given a bonus to prefer them over
92
        steps without parts_list. Diagrams are found at build time, not
93
        during scoring, to allow rotation symbols to claim small images first.
94
        """
95
        if self.has_parts_list:
1✔
96
            # Base score for having parts_list + proximity/alignment bonus
97
            parts_list_bonus = 0.5
1✔
98
            pairing_score = (
1✔
99
                self.step_proximity_score + self.step_alignment_score
100
            ) / 2.0
101
            return parts_list_bonus + 0.5 * pairing_score
1✔
102
        return 0.3  # Lower base score for steps without parts list
1✔
103

104
    def sort_key(self) -> tuple[float, int]:
1✔
105
        """Return a tuple for sorting candidates.
106

107
        We prefer:
108
        1. Higher overall scores (better StepNumber-PartsList-Diagram match)
109
        2. Lower step number values (to break ties and maintain order)
110
        """
111
        return (-self.overall_score(), self.step_value)
1✔
112

113

114
class StepClassifier(LabelClassifier):
1✔
115
    """Classifier for complete Step structures."""
116

117
    output = "step"
1✔
118
    requires = frozenset(
1✔
119
        {
120
            "arrow",
121
            "step_number",
122
            "parts_list",
123
            "diagram",
124
            "rotation_symbol",
125
            "subassembly",
126
            "preview",
127
        }
128
    )
129

130
    def _score(self, result: ClassificationResult) -> None:
1✔
131
        """Score step pairings and create candidates."""
132
        page_data = result.page_data
1✔
133

134
        # Get step number and parts list candidates (not constructed elements)
135
        step_number_candidates = result.get_scored_candidates(
1✔
136
            "step_number", valid_only=False, exclude_failed=True
137
        )
138

139
        if not step_number_candidates:
1✔
140
            return
1✔
141

142
        # Get parts_list candidates
143
        parts_list_candidates = result.get_scored_candidates(
1✔
144
            "parts_list",
145
            valid_only=False,
146
            exclude_failed=True,
147
        )
148

149
        log.debug(
1✔
150
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
151
            page_data.page_number,
152
            len(step_number_candidates),
153
            len(parts_list_candidates),
154
        )
155

156
        # Create all possible Step candidates for pairings (without diagrams initially)
157
        # Deduplication happens at build time, not scoring time, so that
158
        # diagram assignment can be re-evaluated as blocks get claimed.
159
        all_candidates: list[Candidate] = []
1✔
160
        for step_candidate in step_number_candidates:
1✔
161
            # Create candidates for this StepNumber paired with each PartsList
162
            for parts_list_candidate in parts_list_candidates:
1✔
163
                candidate = self._create_step_candidate(
1✔
164
                    step_candidate, parts_list_candidate, result
165
                )
166
                if candidate:
1✔
167
                    all_candidates.append(candidate)
1✔
168

169
            # Also create a candidate with no PartsList (fallback)
170
            candidate = self._create_step_candidate(step_candidate, None, result)
1✔
171
            if candidate:
1✔
172
                all_candidates.append(candidate)
1✔
173

174
        # Add all candidates to result (deduplication happens at build time)
175
        for candidate in all_candidates:
1✔
176
            result.add_candidate(candidate)
1✔
177

178
        log.debug(
1✔
179
            "[step] Created %d step candidates",
180
            len(all_candidates),
181
        )
182

183
    def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
1✔
184
        """Build all Step elements with coordinated element assignment.
185

186
        Build Order (phases run sequentially):
187
        - Phase 1: Rotation symbols - claim small circular arrow graphics
188
        - Phase 2: Parts lists - claim part count blocks and part images
189
        - Phase 3: Arrows - claim arrowhead triangles and shaft lines/rectangles
190
        - Phase 4: Subassemblies & Previews - claim white callout boxes
191
        - Phase 5: Steps - build step_number + parts_list pairs (no diagram yet)
192
        - Phase 6: Diagrams - build remaining unclaimed diagram regions
193
        - Phase 7: Assign diagrams to steps via Hungarian matching
194
        - Phase 8: Assign subassemblies to steps via Hungarian matching
195
        - Phase 9: Finalize steps - add arrows, compute final bboxes
196
        - Phase 10: Assign rotation symbols to steps via Hungarian matching
197

198
        The order matters because elements compete for the same Drawing blocks.
199
        Earlier phases claim blocks first, preventing later phases from using them.
200
        For example:
201
        - Arrows must build before subassemblies so arrow shafts aren't consumed
202
          by subassembly white boxes
203
        - Rotation symbols must build before diagrams so small circular graphics
204
          aren't incorrectly clustered into diagrams
205

206
        Deduplication of steps happens at build time (not scoring time) so that
207
        diagram assignment can be re-evaluated as blocks get claimed.
208

209
        Returns:
210
            List of successfully constructed Step elements, sorted by step number
211
        """
212
        # Phase 1: Build all rotation symbols BEFORE steps.
213
        # This allows rotation symbols to claim their Drawing blocks first,
214
        # preventing them from being incorrectly clustered into diagrams.
215
        for rs_candidate in result.get_scored_candidates(
1✔
216
            "rotation_symbol", valid_only=False, exclude_failed=True
217
        ):
218
            try:
1✔
219
                result.build(rs_candidate)
1✔
220
                log.debug(
1✔
221
                    "[step] Built rotation symbol at %s (score=%.2f)",
222
                    rs_candidate.bbox,
223
                    rs_candidate.score,
224
                )
225
            except Exception as e:
×
226
                log.debug(
×
227
                    "[step] Failed to construct rotation_symbol candidate at %s: %s",
228
                    rs_candidate.bbox,
229
                    e,
230
                )
231

232
        # Phase 2: Build all parts lists BEFORE steps.
233
        # This allows parts lists to claim their Drawing blocks first,
234
        # preventing them from being claimed by subassemblies.
235
        for pl_candidate in result.get_scored_candidates(
1✔
236
            "parts_list", valid_only=False, exclude_failed=True
237
        ):
238
            try:
1✔
239
                result.build(pl_candidate)
1✔
240
                log.debug(
1✔
241
                    "[step] Built parts_list at %s (score=%.2f)",
242
                    pl_candidate.bbox,
243
                    pl_candidate.score,
244
                )
245
            except Exception as e:
×
246
                log.debug(
×
247
                    "[step] Failed to construct parts_list candidate at %s: %s",
248
                    pl_candidate.bbox,
249
                    e,
250
                )
251

252
        # Phase 3: Build all arrows BEFORE subassemblies.
253
        # Arrows connect subassembly callout boxes to main diagrams. The arrow
254
        # shaft may be a thin rectangle that overlaps the subassembly's white
255
        # box. By building arrows first, they claim the shaft blocks before
256
        # subassemblies can consume them.
257
        for arrow_candidate in result.get_scored_candidates(
1✔
258
            "arrow", valid_only=False, exclude_failed=True
259
        ):
260
            try:
1✔
261
                result.build(arrow_candidate)
1✔
262
                log.debug(
1✔
263
                    "[step] Built arrow at %s (score=%.2f)",
264
                    arrow_candidate.bbox,
265
                    arrow_candidate.score,
266
                )
267
            except Exception as e:
×
268
                log.debug(
×
269
                    "[step] Failed to construct arrow candidate at %s: %s",
270
                    arrow_candidate.bbox,
271
                    e,
272
                )
273

274
        # Phase 4: Build subassemblies and previews BEFORE steps.
275
        # Both subassemblies and previews are white boxes with diagrams inside.
276
        # We combine them and build in score order so the higher-scoring
277
        # candidate claims the white box first. When a candidate is built,
278
        # its source_blocks are marked as consumed, causing any competing
279
        # candidate using the same blocks to fail.
280
        #
281
        # This allows subassemblies (which have step_count labels like "2x")
282
        # to be distinguished from previews (which appear before steps).
283
        subassembly_candidates = result.get_scored_candidates(
1✔
284
            "subassembly", valid_only=False, exclude_failed=True
285
        )
286
        preview_candidates = result.get_scored_candidates(
1✔
287
            "preview", valid_only=False, exclude_failed=True
288
        )
289

290
        # Combine and sort by score (highest first)
291
        combined_candidates = list(subassembly_candidates) + list(preview_candidates)
1✔
292
        combined_candidates.sort(key=lambda c: c.score, reverse=True)
1✔
293

294
        for candidate in combined_candidates:
1✔
295
            try:
1✔
296
                result.build(candidate)
1✔
297
                log.debug(
1✔
298
                    "[step] Built %s at %s (score=%.2f)",
299
                    candidate.label,
300
                    candidate.bbox,
301
                    candidate.score,
302
                )
303
            except Exception as e:
1✔
304
                log.debug(
1✔
305
                    "[step] Failed to construct %s candidate at %s: %s",
306
                    candidate.label,
307
                    candidate.bbox,
308
                    e,
309
                )
310

311
        # Phase 5: Build Step candidates with deduplication by step value
312
        # Filter out subassembly steps, then build in score order, skipping
313
        # step values that have already been built.
314
        # Steps are built as "partial" - just step_number + parts_list.
315
        # Diagrams and subassemblies are assigned later via Hungarian matching.
316
        all_step_candidates = result.get_scored_candidates(
1✔
317
            "step", valid_only=False, exclude_failed=True
318
        )
319
        page_level_step_candidates = self._filter_page_level_step_candidates(
1✔
320
            all_step_candidates
321
        )
322

323
        # Sort by score (highest first) so best candidates get built first
324
        sorted_candidates = sorted(
1✔
325
            page_level_step_candidates,
326
            key=lambda c: c.score,
327
            reverse=True,
328
        )
329

330
        steps: list[Step] = []
1✔
331
        built_step_values: set[int] = set()
1✔
332

333
        for step_candidate in sorted_candidates:
1✔
334
            # Get step value from score
335
            score = step_candidate.score_details
1✔
336
            if not isinstance(score, _StepScore):
1✔
337
                continue
×
338

339
            # Skip if we've already built a step with this value
340
            if score.step_value in built_step_values:
1✔
341
                log.debug(
1✔
342
                    "[step] Skipping duplicate step %d candidate (score=%.2f)",
343
                    score.step_value,
344
                    step_candidate.score,
345
                )
346
                continue
1✔
347

348
            try:
1✔
349
                elem = result.build(step_candidate)
1✔
350
                assert isinstance(elem, Step)
1✔
351
                steps.append(elem)
1✔
352
                built_step_values.add(score.step_value)
1✔
353
                log.debug(
1✔
354
                    "[step] Built partial step %d (parts_list=%s, score=%.2f)",
355
                    score.step_value,
356
                    "yes" if score.parts_list_candidate is not None else "no",
357
                    step_candidate.score,
358
                )
359
            except Exception as e:
1✔
360
                log.debug(
1✔
361
                    "[step] Failed to construct step %d candidate: %s",
362
                    score.step_value,
363
                    e,
364
                )
365

366
        # Sort steps by their step_number value
367
        steps.sort(key=lambda step: step.step_number.value)
1✔
368

369
        # Phase 6: Build diagrams (not yet claimed by subassemblies)
370
        # Phase 7: Assign diagrams to steps using Hungarian matching
371
        # Collect available diagram candidates (not yet claimed by subassemblies)
372
        available_diagrams: list[Diagram] = []
1✔
373
        for diag_candidate in result.get_scored_candidates(
1✔
374
            "diagram", valid_only=False, exclude_failed=True
375
        ):
376
            if diag_candidate.constructed is None:
1✔
377
                # Build the diagram
378
                try:
1✔
379
                    diag_elem = result.build(diag_candidate)
1✔
380
                    assert isinstance(diag_elem, Diagram)
1✔
381
                    available_diagrams.append(diag_elem)
1✔
382
                except Exception as e:
1✔
383
                    log.debug(
1✔
384
                        "[step] Failed to build diagram at %s: %s",
385
                        diag_candidate.bbox,
386
                        e,
387
                    )
388
            elif isinstance(diag_candidate.constructed, Diagram):
1✔
389
                # Already built (claimed by subassembly) - skip
390
                pass
1✔
391

392
        log.debug(
1✔
393
            "[step] Available diagrams for step assignment: %d",
394
            len(available_diagrams),
395
        )
396

397
        # Assign diagrams to steps using Hungarian matching
398
        assign_diagrams_to_steps(steps, available_diagrams)
1✔
399

400
        # Phase 8: Assign subassemblies to steps using Hungarian matching
401
        # Collect built subassemblies
402
        subassemblies: list[SubAssembly] = []
1✔
403
        for sa_candidate in result.get_scored_candidates(
1✔
404
            "subassembly", valid_only=True
405
        ):
406
            assert sa_candidate.constructed is not None
1✔
407
            assert isinstance(sa_candidate.constructed, SubAssembly)
1✔
408
            subassemblies.append(sa_candidate.constructed)
1✔
409

410
        # Collect built dividers for obstruction checking
411
        dividers: list[Divider] = []
1✔
412
        for div_candidate in result.get_scored_candidates("divider", valid_only=True):
1✔
413
            assert div_candidate.constructed is not None
1✔
414
            assert isinstance(div_candidate.constructed, Divider)
1✔
415
            dividers.append(div_candidate.constructed)
1✔
416

417
        log.debug(
1✔
418
            "[step] Assigning %d subassemblies to %d steps (%d dividers for "
419
            "obstruction checking)",
420
            len(subassemblies),
421
            len(steps),
422
            len(dividers),
423
        )
424

425
        # Assign subassemblies to steps using Hungarian matching
426
        assign_subassemblies_to_steps(steps, subassemblies, dividers)
1✔
427

428
        # Phase 9: Finalize steps - collect arrows and compute final bboxes
429
        page_bbox = result.page_data.bbox
1✔
430
        for step in steps:
1✔
431
            # Get arrows for this step
432
            arrows = self._get_arrows_for_step(step.step_number, step.diagram, result)
1✔
433

434
            # Compute final bbox including all components
435
            final_bbox = self._compute_step_bbox(
1✔
436
                step.step_number,
437
                step.parts_list,
438
                step.diagram,
439
                step.subassemblies,
440
                page_bbox,
441
            )
442

443
            # Update the step (Step is a Pydantic model, so we need to reassign)
444
            # We mutate in place by directly setting attributes
445
            object.__setattr__(step, "arrows", arrows)
1✔
446
            object.__setattr__(step, "bbox", final_bbox)
1✔
447

448
        # Phase 10: Assign rotation symbols to steps using Hungarian matching
449
        # Collect built rotation symbols
450
        rotation_symbols: list[RotationSymbol] = []
1✔
451
        for rs_candidate in result.get_scored_candidates(
1✔
452
            "rotation_symbol", valid_only=True
453
        ):
454
            assert rs_candidate.constructed is not None
1✔
455
            assert isinstance(rs_candidate.constructed, RotationSymbol)
1✔
456
            rotation_symbols.append(rs_candidate.constructed)
1✔
457

458
        assign_rotation_symbols_to_steps(steps, rotation_symbols)
1✔
459

460
        log.debug(
1✔
461
            "[step] build_all complete: %d steps, %d rotation symbols assigned",
462
            len(steps),
463
            sum(1 for s in steps if s.rotation_symbol is not None),
464
        )
465

466
        # Cast for type compatibility with base class return type
467
        return list[LegoPageElements](steps)
1✔
468

469
    def _filter_page_level_step_candidates(
1✔
470
        self, candidates: list[Candidate]
471
    ) -> list[Candidate]:
472
        """Filter step candidates to exclude likely subassembly steps.
473

474
        Extracts step number values from candidates and uses the generic
475
        filter_subassembly_values function to filter out subassembly steps.
476

477
        Args:
478
            candidates: All step candidates to filter
479

480
        Returns:
481
            Filtered list of candidates likely to be page-level steps
482
        """
483
        # Extract step number values from candidates
484
        items_with_values: list[tuple[int, Candidate]] = []
1✔
485
        for candidate in candidates:
1✔
486
            score = candidate.score_details
1✔
487
            if not isinstance(score, _StepScore):
1✔
488
                continue
×
489
            items_with_values.append((score.step_value, candidate))
1✔
490

491
        # Use generic filtering function
492
        filtered_items = filter_subassembly_values(items_with_values)
1✔
493

494
        # If filtering occurred, return only the filtered candidates
495
        if len(filtered_items) != len(items_with_values):
1✔
496
            filtered_values = [v for v, _ in filtered_items]
1✔
497
            log.debug(
1✔
498
                "[step] Filtered out likely subassembly steps, keeping values: %s",
499
                sorted(filtered_values),
500
            )
501
            return [item for _, item in filtered_items]
1✔
502

503
        # No filtering occurred, return original candidates
504
        # (includes any candidates that couldn't have their value extracted)
505
        return candidates
1✔
506

507
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
1✔
508
        """Construct a partial Step element from a single candidate.
509

510
        This creates a Step with just step_number and parts_list. The diagram,
511
        subassemblies, and arrows are assigned later in build_all() using
512
        Hungarian matching for optimal global assignment.
513
        """
514
        score = candidate.score_details
1✔
515
        assert isinstance(score, _StepScore)
1✔
516

517
        # Validate and extract step number from parent candidate
518
        step_num_candidate = score.step_number_candidate
1✔
519

520
        step_num_elem = result.build(step_num_candidate)
1✔
521
        assert isinstance(step_num_elem, StepNumber)
1✔
522
        step_num = step_num_elem
1✔
523

524
        # Validate and extract parts list from parent candidate (if present)
525
        parts_list = None
1✔
526
        if score.parts_list_candidate:
1✔
527
            parts_list_candidate = score.parts_list_candidate
1✔
528
            parts_list_elem = result.build(parts_list_candidate)
1✔
529
            assert isinstance(parts_list_elem, PartsList)
1✔
530
            parts_list = parts_list_elem
1✔
531

532
        # Create partial step - diagram, subassemblies, arrows assigned later
533
        initial_bbox = step_num.bbox
1✔
534
        if parts_list:
1✔
535
            initial_bbox = initial_bbox.union(parts_list.bbox)
1✔
536

537
        return Step(
1✔
538
            bbox=initial_bbox,
539
            step_number=step_num,
540
            parts_list=parts_list,
541
            diagram=None,
542
            rotation_symbol=None,
543
            arrows=[],
544
            subassemblies=[],
545
        )
546

547
    def _find_and_build_diagram_for_step(
1✔
548
        self,
549
        step_num: StepNumber,
550
        parts_list: PartsList | None,
551
        result: ClassificationResult,
552
    ) -> Diagram | None:
553
        """Find and build the best diagram for this step.
554

555
        This is called at build time, after rotation symbols and subassemblies
556
        have been built (in build_all Phases 1 and 3), so they've already
557
        claimed any images/diagrams they need. We filter to only consider
558
        diagram candidates that haven't been constructed yet.
559

560
        Args:
561
            step_num: The built step number element
562
            parts_list: The built parts list element (if any)
563
            result: Classification result containing diagram candidates
564

565
        Returns:
566
            The built Diagram element, or None if no suitable diagram found
567
        """
568
        # Get all non-failed, non-constructed diagram candidates
569
        diagram_candidates = result.get_scored_candidates(
×
570
            "diagram", valid_only=False, exclude_failed=True
571
        )
572

573
        # Filter to only candidates that haven't been built yet
574
        # (subassemblies and rotation symbols built earlier may have claimed some)
575
        available_candidates = [c for c in diagram_candidates if c.constructed is None]
×
576

577
        if not available_candidates:
×
578
            log.debug(
×
579
                "[step] No diagram candidates available for step %d",
580
                step_num.value,
581
            )
582
            return None
×
583

584
        # Score each candidate based on position relative to step
585
        step_bbox = step_num.bbox
×
586
        best_candidate = None
×
587
        best_score = -float("inf")
×
588

589
        for candidate in available_candidates:
×
590
            score = self._score_step_diagram_pair(step_bbox, candidate.bbox)
×
591

592
            log.debug(
×
593
                "[step] Diagram candidate at %s for step %d: score=%.2f",
594
                candidate.bbox,
595
                step_num.value,
596
                score,
597
            )
598

599
            if score > best_score:
×
600
                best_score = score
×
601
                best_candidate = candidate
×
602

603
        if best_candidate is None or best_score < 0.2:
×
604
            log.debug(
×
605
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
606
                step_num.value,
607
                best_score,
608
            )
609
            return None
×
610

611
        # Build the diagram
612
        try:
×
613
            diagram_elem = result.build(best_candidate)
×
614
            assert isinstance(diagram_elem, Diagram)
×
615
            log.debug(
×
616
                "[step] Built diagram at %s for step %d (score=%.2f)",
617
                diagram_elem.bbox,
618
                step_num.value,
619
                best_score,
620
            )
621
            return diagram_elem
×
622
        except CandidateFailedError as e:
×
623
            log.debug(
×
624
                "[step] Failed to build diagram for step %d: %s",
625
                step_num.value,
626
                e,
627
            )
628
            return None
×
629

630
    def _create_step_candidate(
1✔
631
        self,
632
        step_candidate: Candidate,
633
        parts_list_candidate: Candidate | None,
634
        result: ClassificationResult,
635
    ) -> Candidate | None:
636
        """Create a Step candidate (without diagram assignment).
637

638
        Diagrams are found at build time, not during scoring, to allow
639
        rotation symbols to claim small images first.
640

641
        Args:
642
            step_candidate: The StepNumber candidate for this step
643
            parts_list_candidate: The PartsList candidate to pair with (or None)
644
            result: Classification result
645

646
        Returns:
647
            The created Candidate with score but no construction, or None if
648
            the step number value cannot be extracted.
649
        """
650
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
1✔
651
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
1✔
652
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance
1✔
653

654
        # Extract step number value from the candidate
655
        if not step_candidate.source_blocks:
1✔
656
            return None
×
657
        source_block = step_candidate.source_blocks[0]
1✔
658
        if not isinstance(source_block, Text):
1✔
659
            return None
×
660
        step_value = extract_step_number_value(source_block.text)
1✔
661
        if step_value is None:
1✔
662
            return None
×
663

664
        step_bbox = step_candidate.bbox
1✔
665
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None
1✔
666

667
        # Calculate pairing scores if there's a parts_list above the step
668
        proximity_score = 0.0
1✔
669
        alignment_score = 0.0
1✔
670

671
        if (
1✔
672
            parts_list_bbox is not None
673
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
674
        ):
675
            # Calculate distance (how far apart vertically)
676
            distance = step_bbox.y0 - parts_list_bbox.y1
1✔
677

678
            # Calculate proximity score
679
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
1✔
680
            if max_distance > 0:
1✔
681
                proximity_score = max(0.0, 1.0 - (distance / max_distance))
1✔
682

683
            # Calculate alignment score (how well left edges align)
684
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
1✔
685
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
1✔
686
            if max_alignment_diff > 0:
1✔
687
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))
1✔
688

689
        # Create score object with candidate references
690
        # Diagrams are found at build time, not during scoring
691
        score = _StepScore(
1✔
692
            step_value=step_value,
693
            step_number_candidate=step_candidate,
694
            parts_list_candidate=parts_list_candidate,
695
            has_parts_list=parts_list_candidate is not None,
696
            step_proximity_score=proximity_score,
697
            step_alignment_score=alignment_score,
698
        )
699

700
        # Calculate combined bbox for the candidate (without diagram)
701
        combined_bbox = step_bbox
1✔
702
        if parts_list_bbox:
1✔
703
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)
1✔
704

705
        # Create candidate
706
        return Candidate(
1✔
707
            bbox=combined_bbox,
708
            label="step",
709
            score=score.overall_score(),
710
            score_details=score,
711
            source_blocks=[],
712
        )
713

714
    def _score_step_diagram_pair(
1✔
715
        self,
716
        step_bbox: BBox,
717
        diagram_bbox: BBox,
718
    ) -> float:
719
        """Score how well a diagram matches a step.
720

721
        Diagrams are typically positioned to the right of and/or below the step
722
        number. This method scores based on:
723
        - Horizontal position: prefer diagrams to the right, penalize left
724
        - Vertical position: prefer diagrams below the step header
725
        - Distance: closer is better
726

727
        Args:
728
            step_bbox: The step number bounding box
729
            diagram_bbox: The diagram bounding box to score
730

731
        Returns:
732
            Score between 0.0 and 1.0 (higher is better match)
733
        """
734
        # Reference point: bottom-right of step number
735
        ref_x = step_bbox.x1
×
736
        ref_y = step_bbox.y1
×
737

738
        # TODO Move all these constants into config, or make them adaptive?
739

740
        # Horizontal score
741
        # Diagrams to the right are preferred, but allow some overlap
742
        x_offset = diagram_bbox.x0 - ref_x
×
743

744
        if x_offset >= -50:
×
745
            # Diagram starts to the right or slightly overlapping - good
746
            # Score decreases slightly with distance to the right
747
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
×
748
        elif x_offset >= -200:
×
749
            # Diagram is moderately to the left - acceptable
750
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
×
751
        else:
752
            # Diagram is far to the left - poor match
753
            x_score = max(0.1, 0.3 + x_offset / 400.0)
×
754

755
        # Vertical score
756
        # Diagrams below the step header are preferred
757
        y_offset = diagram_bbox.y0 - ref_y
×
758

759
        if y_offset >= -30:
×
760
            # Diagram starts below or slightly overlapping - good
761
            # Score decreases with vertical distance
762
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
×
763
        elif y_offset >= -100:
×
764
            # Diagram is moderately above - less good but acceptable
765
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
×
766
        else:
767
            # Diagram is far above - poor match
768
            y_score = max(0.05, 0.2 + y_offset / 300.0)
×
769

770
        # Combined score - weight both dimensions equally
771
        score = 0.5 * x_score + 0.5 * y_score
×
772

773
        return score
×
774

775
    def _get_rotation_symbol_for_step(
1✔
776
        self,
777
        step_bbox: BBox,
778
        result: ClassificationResult,
779
    ) -> RotationSymbol | None:
780
        """Find an already-built rotation symbol within this step's area.
781

782
        Rotation symbols are built by PageClassifier before steps are processed.
783
        This method finds the already-built rotation symbol that falls within
784
        the step's bounding box.
785

786
        Args:
787
            step_bbox: The step's bounding box (including step_num, parts_list,
788
                and diagram)
789
            result: Classification result containing rotation symbol candidates
790

791
        Returns:
792
            Single RotationSymbol element for this step, or None if not found
793
        """
794
        # Get rotation_symbol candidates - they should already be built
795
        # by PageClassifier. Use valid_only=True to only get successfully
796
        # constructed rotation symbols.
797
        rotation_symbol_candidates = result.get_scored_candidates(
×
798
            "rotation_symbol", valid_only=True
799
        )
800

801
        log.debug(
×
802
            "[step] Looking for rotation symbols in step bbox %s, "
803
            "found %d built candidates",
804
            step_bbox,
805
            len(rotation_symbol_candidates),
806
        )
807

808
        if not rotation_symbol_candidates:
×
809
            return None
×
810

811
        # Find best-scoring rotation symbol overlapping the step's bbox
812
        overlapping_candidates = filter_overlapping(
×
813
            rotation_symbol_candidates, step_bbox
814
        )
815
        best_candidate = find_best_scoring(overlapping_candidates)
×
816

817
        if best_candidate and best_candidate.constructed is not None:
×
818
            rotation_symbol = best_candidate.constructed
×
819
            assert isinstance(rotation_symbol, RotationSymbol)
×
820
            log.debug(
×
821
                "[step] Found rotation symbol at %s (score=%.2f)",
822
                rotation_symbol.bbox,
823
                best_candidate.score,
824
            )
825
            return rotation_symbol
×
826

827
        log.debug("[step] No rotation symbol found in step bbox %s", step_bbox)
×
828
        return None
×
829

830
    def _get_arrows_for_step(
1✔
831
        self,
832
        step_num: StepNumber,
833
        diagram: Diagram | None,
834
        result: ClassificationResult,
835
    ) -> list[Arrow]:
836
        """Collect already-built arrows that belong to this step.
837

838
        Arrows are built in Phase 3 of build_all(), before subassemblies.
839
        This method finds arrows that are positioned near the step's diagram
840
        and assigns them to this step.
841

842
        Typically these are arrows pointing from subassembly callout boxes
843
        to the main diagram.
844

845
        Args:
846
            step_num: The step number element
847
            diagram: The diagram element (if any)
848
            result: Classification result containing arrow candidates
849

850
        Returns:
851
            List of Arrow elements for this step
852
        """
853
        # Get already-built arrows (built in Phase 3)
854
        arrow_candidates = result.get_scored_candidates(
1✔
855
            "arrow",
856
            valid_only=True,  # Only get successfully built arrows
857
        )
858

859
        log.debug(
1✔
860
            "[step] Looking for arrows for step %d, found %d built arrows",
861
            step_num.value,
862
            len(arrow_candidates),
863
        )
864

865
        if not arrow_candidates:
1✔
866
            return []
1✔
867

868
        # Determine search region: prefer diagram area, fallback to step area
869
        search_bbox = diagram.bbox if diagram else step_num.bbox
1✔
870

871
        # Expand search region to catch arrows near the diagram
872
        # Use a larger margin than rotation symbols since arrows can extend further
873
        search_region = search_bbox.expand(100.0)
1✔
874

875
        log.debug(
1✔
876
            "[step] Arrow search region for step %d: %s",
877
            step_num.value,
878
            search_region,
879
        )
880

881
        # Find arrows within or overlapping the search region
882
        arrows: list[Arrow] = []
1✔
883
        overlapping_candidates = filter_overlapping(arrow_candidates, search_region)
1✔
884

885
        for candidate in overlapping_candidates:
1✔
886
            assert candidate.constructed is not None
1✔
887
            assert isinstance(candidate.constructed, Arrow)
1✔
888
            log.debug(
1✔
889
                "[step]   Arrow at %s overlaps search region (score=%.2f)",
890
                candidate.bbox,
891
                candidate.score,
892
            )
893
            arrows.append(candidate.constructed)
1✔
894

895
        log.debug(
1✔
896
            "[step] Found %d arrows for step %d",
897
            len(arrows),
898
            step_num.value,
899
        )
900
        return arrows
1✔
901

902
    def _get_subassemblies_for_step(
1✔
903
        self,
904
        step_num: StepNumber,
905
        diagram: Diagram | None,
906
        result: ClassificationResult,
907
    ) -> list[SubAssembly]:
908
        """Get already-built subassemblies that belong to this step.
909

910
        Subassemblies are built in build_all() before steps. This method finds
911
        subassemblies that are positioned near this step's diagram and haven't
912
        been claimed by another step yet.
913

914
        Args:
915
            step_num: The step number element
916
            diagram: The diagram element (if any)
917
            result: Classification result containing subassembly candidates
918

919
        Returns:
920
            List of SubAssembly elements for this step
921
        """
922
        # Get all built subassemblies
923
        all_subassemblies: list[SubAssembly] = []
×
924
        for sa_candidate in result.get_scored_candidates(
×
925
            "subassembly", valid_only=True
926
        ):
927
            assert sa_candidate.constructed is not None
×
928
            assert isinstance(sa_candidate.constructed, SubAssembly)
×
929
            all_subassemblies.append(sa_candidate.constructed)
×
930

931
        log.debug(
×
932
            "[step] Looking for subassemblies for step %d, found %d built",
933
            step_num.value,
934
            len(all_subassemblies),
935
        )
936

937
        if not all_subassemblies:
×
938
            return []
×
939

940
        # Determine search region based on step_number and diagram
941
        reference_bbox = diagram.bbox.union(step_num.bbox) if diagram else step_num.bbox
×
942

943
        page_bbox = result.page_data.bbox
×
944

945
        # Expand search region around the reference area
946
        # Horizontally: search the full page width
947
        # Vertically: search a region around the reference bbox
948
        vertical_margin = reference_bbox.height
×
949
        search_region = BBox(
×
950
            x0=page_bbox.x0,
951
            y0=max(page_bbox.y0, reference_bbox.y0 - vertical_margin),
952
            x1=page_bbox.x1,
953
            y1=min(page_bbox.y1, reference_bbox.y1 + vertical_margin),
954
        )
955

956
        log.debug(
×
957
            "[step] SubAssembly search region for step %d: %s",
958
            step_num.value,
959
            search_region,
960
        )
961

962
        # Find subassemblies that overlap the search region
963
        subassemblies: list[SubAssembly] = []
×
964
        for subassembly in all_subassemblies:
×
965
            if subassembly.bbox.overlaps(search_region):
×
966
                log.debug(
×
967
                    "[step]   SubAssembly at %s overlaps search region",
968
                    subassembly.bbox,
969
                )
970
                subassemblies.append(subassembly)
×
971

972
        log.debug(
×
973
            "[step] Found %d subassemblies for step %d",
974
            len(subassemblies),
975
            step_num.value,
976
        )
977
        return subassemblies
×
978

979
    def _compute_step_bbox(
1✔
980
        self,
981
        step_num: StepNumber,
982
        parts_list: PartsList | None,
983
        diagram: Diagram | None,
984
        subassemblies: list[SubAssembly],
985
        page_bbox: BBox,
986
    ) -> BBox:
987
        """Compute the overall bounding box for the Step.
988

989
        This encompasses the step number, parts list (if any), diagram (if any),
990
        and all subassemblies.
991
        The result is clipped to the page bounds to handle elements that extend
992
        slightly off-page (e.g., arrows in diagrams).
993

994
        Args:
995
            step_num: The step number element
996
            parts_list: The parts list (if any)
997
            diagram: The diagram element (if any)
998
            subassemblies: List of subassemblies for this step
999
            page_bbox: The page bounding box to clip to
1000

1001
        Returns:
1002
            Combined bounding box, clipped to page bounds
1003
        """
1004
        bboxes = [step_num.bbox]
1✔
1005
        if parts_list:
1✔
1006
            bboxes.append(parts_list.bbox)
1✔
1007
        if diagram:
1✔
1008
            bboxes.append(diagram.bbox)
1✔
1009
        for sa in subassemblies:
1✔
1010
            bboxes.append(sa.bbox)
1✔
1011

1012
        return BBox.union_all(bboxes).clip_to(page_bbox)
1✔
1013

1014

1015
def assign_diagrams_to_steps(
1✔
1016
    steps: list[Step],
1017
    diagrams: list[Diagram],
1018
    max_distance: float = 500.0,
1019
) -> None:
1020
    """Assign diagrams to steps using Hungarian algorithm.
1021

1022
    Uses optimal bipartite matching to pair diagrams with steps based on
1023
    a scoring function that considers:
1024
    - Distance from step_number to diagram (closer is better)
1025
    - Relative position (diagram typically below/right of step_number)
1026

1027
    This function mutates the Step objects in place, setting their diagram
1028
    attribute.
1029

1030
    Args:
1031
        steps: List of Step objects to assign diagrams to
1032
        diagrams: List of Diagram objects to assign
1033
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1034
            greater than this will not be matched.
1035
    """
1036
    if not steps or not diagrams:
1✔
1037
        log.debug(
1✔
1038
            "[step] No diagram assignment needed: steps=%d, diagrams=%d",
1039
            len(steps),
1040
            len(diagrams),
1041
        )
1042
        return
1✔
1043

1044
    n_steps = len(steps)
1✔
1045
    n_diagrams = len(diagrams)
1✔
1046

1047
    log.debug(
1✔
1048
        "[step] Running Hungarian matching for diagrams: %d steps, %d diagrams",
1049
        n_steps,
1050
        n_diagrams,
1051
    )
1052

1053
    # Build cost matrix: rows = diagrams, cols = steps
1054
    # Lower cost = better match
1055
    cost_matrix = np.zeros((n_diagrams, n_steps))
1✔
1056

1057
    for i, diag in enumerate(diagrams):
1✔
1058
        diag_center = diag.bbox.center
1✔
1059
        for j, step in enumerate(steps):
1✔
1060
            step_num_center = step.step_number.bbox.center
1✔
1061

1062
            # Base cost is distance from step_number to diagram center
1063
            distance = (
1✔
1064
                (step_num_center[0] - diag_center[0]) ** 2
1065
                + (step_num_center[1] - diag_center[1]) ** 2
1066
            ) ** 0.5
1067

1068
            # Apply position penalty: prefer diagrams that are below or to the
1069
            # right of the step_number (typical LEGO instruction layout)
1070
            # If diagram is above the step_number, add penalty
1071
            if diag_center[1] < step_num_center[1] - 50:  # Diagram above step
1✔
1072
                distance *= 1.5  # Penalty for being above
1✔
1073

1074
            cost_matrix[i, j] = distance
1✔
1075
            log.debug(
1✔
1076
                "[step]   Cost diagram[%d] at %s to step[%d]: %.1f",
1077
                i,
1078
                diag.bbox,
1079
                step.step_number.value,
1080
                distance,
1081
            )
1082

1083
    # Apply max_distance threshold
1084
    high_cost = max_distance * 10
1✔
1085
    cost_matrix_thresholded = np.where(
1✔
1086
        cost_matrix > max_distance, high_cost, cost_matrix
1087
    )
1088

1089
    # Run Hungarian algorithm
1090
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1091

1092
    # Assign diagrams to steps based on the matching
1093
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1094
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1095
            diag = diagrams[row_idx]
1✔
1096
            step = steps[col_idx]
1✔
1097
            # Use object.__setattr__ because Step is a frozen Pydantic model
1098
            object.__setattr__(step, "diagram", diag)
1✔
1099
            log.debug(
1✔
1100
                "[step] Assigned diagram at %s to step %d (cost=%.1f)",
1101
                diag.bbox,
1102
                step.step_number.value,
1103
                cost_matrix[row_idx, col_idx],
1104
            )
1105
        else:
1106
            log.debug(
×
1107
                "[step] Skipped diagram assignment: diagram[%d] to step[%d] "
1108
                "cost=%.1f > max_distance=%.1f",
1109
                row_idx,
1110
                col_idx,
1111
                cost_matrix[row_idx, col_idx],
1112
                max_distance,
1113
            )
1114

1115

1116
def _has_divider_between(
1✔
1117
    bbox1: BBox,
1118
    bbox2: BBox,
1119
    dividers: list[Divider],
1120
) -> bool:
1121
    """Check if there is a divider line between two bboxes.
1122

1123
    A divider is considered "between" if it crosses the line connecting
1124
    the centers of the two bboxes.
1125

1126
    Args:
1127
        bbox1: First bounding box
1128
        bbox2: Second bounding box
1129
        dividers: List of dividers to check
1130

1131
    Returns:
1132
        True if a divider is between the two bboxes
1133
    """
1134
    center1 = bbox1.center
1✔
1135
    center2 = bbox2.center
1✔
1136

1137
    for divider in dividers:
1✔
1138
        div_bbox = divider.bbox
1✔
1139

1140
        # Check if divider is vertical (separates left/right)
1141
        if div_bbox.width < div_bbox.height * 0.2:  # Vertical line
1✔
1142
            # Check if divider x is between the two centers
1143
            min_x = min(center1[0], center2[0])
1✔
1144
            max_x = max(center1[0], center2[0])
1✔
1145
            div_x = div_bbox.center[0]
1✔
1146
            if min_x < div_x < max_x:
1✔
1147
                # Check if divider spans the y range
1148
                min_y = min(center1[1], center2[1])
1✔
1149
                max_y = max(center1[1], center2[1])
1✔
1150
                if div_bbox.y0 <= max_y and div_bbox.y1 >= min_y:
1✔
1151
                    return True
1✔
1152

1153
        # Check if divider is horizontal (separates top/bottom)
1154
        elif div_bbox.height < div_bbox.width * 0.2:  # Horizontal line
×
1155
            # Check if divider y is between the two centers
1156
            min_y = min(center1[1], center2[1])
×
1157
            max_y = max(center1[1], center2[1])
×
1158
            div_y = div_bbox.center[1]
×
1159
            if min_y < div_y < max_y:
×
1160
                # Check if divider spans the x range
1161
                min_x = min(center1[0], center2[0])
×
1162
                max_x = max(center1[0], center2[0])
×
1163
                if div_bbox.x0 <= max_x and div_bbox.x1 >= min_x:
×
1164
                    return True
×
1165

1166
    return False
1✔
1167

1168

1169
def assign_subassemblies_to_steps(
1✔
1170
    steps: list[Step],
1171
    subassemblies: list[SubAssembly],
1172
    dividers: list[Divider],
1173
    max_distance: float = 400.0,
1174
) -> None:
1175
    """Assign subassemblies to steps using Hungarian algorithm.
1176

1177
    Uses optimal bipartite matching to pair subassemblies with steps based on:
1178
    - Distance from step's diagram to subassembly (closer is better)
1179
    - No divider between the subassembly and the step's diagram
1180

1181
    This function mutates the Step objects in place, adding to their
1182
    subassemblies list.
1183

1184
    Args:
1185
        steps: List of Step objects to assign subassemblies to
1186
        subassemblies: List of SubAssembly objects to assign
1187
        dividers: List of Divider objects for obstruction checking
1188
        max_distance: Maximum distance for a valid assignment
1189
    """
1190
    if not steps or not subassemblies:
1✔
1191
        log.debug(
1✔
1192
            "[step] No subassembly assignment needed: steps=%d, subassemblies=%d",
1193
            len(steps),
1194
            len(subassemblies),
1195
        )
1196
        return
1✔
1197

1198
    n_steps = len(steps)
1✔
1199
    n_subassemblies = len(subassemblies)
1✔
1200

1201
    log.debug(
1✔
1202
        "[step] Running Hungarian matching for subassemblies: "
1203
        "%d steps, %d subassemblies",
1204
        n_steps,
1205
        n_subassemblies,
1206
    )
1207

1208
    # Build cost matrix: rows = subassemblies, cols = steps
1209
    cost_matrix = np.zeros((n_subassemblies, n_steps))
1✔
1210
    high_cost = max_distance * 10
1✔
1211

1212
    for i, sa in enumerate(subassemblies):
1✔
1213
        sa_center = sa.bbox.center
1✔
1214
        for j, step in enumerate(steps):
1✔
1215
            # Use diagram center if available, otherwise step bbox center
1216
            if step.diagram:
1✔
1217
                target_bbox = step.diagram.bbox
1✔
1218
                target_center = target_bbox.center
1✔
1219
            else:
1220
                target_bbox = step.bbox
×
1221
                target_center = step.step_number.bbox.center
×
1222

1223
            # Base cost is distance from diagram/step to subassembly center
1224
            distance = (
1✔
1225
                (target_center[0] - sa_center[0]) ** 2
1226
                + (target_center[1] - sa_center[1]) ** 2
1227
            ) ** 0.5
1228

1229
            # Check for divider between subassembly and step's diagram
1230
            if _has_divider_between(sa.bbox, target_bbox, dividers):
1✔
1231
                # High cost if there's a divider between
1232
                distance = high_cost
1✔
1233
                log.debug(
1✔
1234
                    "[step]   Divider between subassembly[%d] at %s and step[%d]",
1235
                    i,
1236
                    sa.bbox,
1237
                    step.step_number.value,
1238
                )
1239

1240
            cost_matrix[i, j] = distance
1✔
1241
            log.debug(
1✔
1242
                "[step]   Cost subassembly[%d] at %s to step[%d]: %.1f",
1243
                i,
1244
                sa.bbox,
1245
                step.step_number.value,
1246
                distance,
1247
            )
1248

1249
    # Apply max_distance threshold
1250
    cost_matrix_thresholded = np.where(
1✔
1251
        cost_matrix > max_distance, high_cost, cost_matrix
1252
    )
1253

1254
    # Run Hungarian algorithm
1255
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1256

1257
    # Assign subassemblies to steps based on the matching
1258
    # Note: Unlike diagrams, multiple subassemblies can be assigned to one step
1259
    # The Hungarian algorithm gives us the best 1:1 matching, but we may want
1260
    # to assign additional nearby subassemblies too
1261

1262
    # First, do the 1:1 optimal assignment
1263
    assigned_subassemblies: set[int] = set()
1✔
1264
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1265
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1266
            sa = subassemblies[row_idx]
1✔
1267
            step = steps[col_idx]
1✔
1268
            # Add to step's subassemblies list
1269
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1270
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1271
            assigned_subassemblies.add(row_idx)
1✔
1272
            log.debug(
1✔
1273
                "[step] Assigned subassembly at %s to step %d (cost=%.1f)",
1274
                sa.bbox,
1275
                step.step_number.value,
1276
                cost_matrix[row_idx, col_idx],
1277
            )
1278

1279
    # Then, try to assign remaining unassigned subassemblies to their nearest step
1280
    # (if within max_distance and no divider between)
1281
    for i, sa in enumerate(subassemblies):
1✔
1282
        if i in assigned_subassemblies:
1✔
1283
            continue
1✔
1284

1285
        # Find the step with lowest cost for this subassembly
1286
        best_step_idx = None
1✔
1287
        best_cost = high_cost
1✔
1288
        for j in range(len(steps)):
1✔
1289
            if cost_matrix[i, j] < best_cost:
1✔
1290
                best_cost = cost_matrix[i, j]
1✔
1291
                best_step_idx = j
1✔
1292

1293
        if best_step_idx is not None and best_cost <= max_distance:
1✔
1294
            step = steps[best_step_idx]
1✔
1295
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1296
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1297
            log.debug(
1✔
1298
                "[step] Assigned extra subassembly at %s to step %d (cost=%.1f)",
1299
                sa.bbox,
1300
                step.step_number.value,
1301
                best_cost,
1302
            )
1303

1304

1305
def assign_rotation_symbols_to_steps(
1✔
1306
    steps: list[Step],
1307
    rotation_symbols: list[RotationSymbol],
1308
    max_distance: float = 300.0,
1309
) -> None:
1310
    """Assign rotation symbols to steps using Hungarian algorithm.
1311

1312
    Uses optimal bipartite matching to pair rotation symbols with steps based on
1313
    minimum distance from the rotation symbol to the step's diagram (or step bbox
1314
    center if no diagram).
1315

1316
    This function mutates the Step objects in place, setting their rotation_symbol
1317
    attribute.
1318

1319
    Args:
1320
        steps: List of Step objects to assign rotation symbols to
1321
        rotation_symbols: List of RotationSymbol objects to assign
1322
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1323
            greater than this will not be matched.
1324
    """
1325
    if not steps or not rotation_symbols:
1✔
1326
        log.debug(
1✔
1327
            "[step] No rotation symbol assignment needed: "
1328
            "steps=%d, rotation_symbols=%d",
1329
            len(steps),
1330
            len(rotation_symbols),
1331
        )
1332
        return
1✔
1333

1334
    n_steps = len(steps)
1✔
1335
    n_symbols = len(rotation_symbols)
1✔
1336

1337
    log.debug(
1✔
1338
        "[step] Running Hungarian matching: %d steps, %d rotation symbols",
1339
        n_steps,
1340
        n_symbols,
1341
    )
1342

1343
    # Build cost matrix: rows = rotation symbols, cols = steps
1344
    # Cost = distance from rotation symbol center to step's diagram center
1345
    # (or step bbox center if no diagram)
1346
    cost_matrix = np.zeros((n_symbols, n_steps))
1✔
1347

1348
    for i, rs in enumerate(rotation_symbols):
1✔
1349
        rs_center = rs.bbox.center
1✔
1350
        for j, step in enumerate(steps):
1✔
1351
            # Use diagram center if available, otherwise step bbox center
1352
            if step.diagram:
1✔
1353
                target_center = step.diagram.bbox.center
1✔
1354
            else:
1355
                target_center = step.bbox.center
×
1356

1357
            # Euclidean distance
1358
            distance = (
1✔
1359
                (rs_center[0] - target_center[0]) ** 2
1360
                + (rs_center[1] - target_center[1]) ** 2
1361
            ) ** 0.5
1362

1363
            cost_matrix[i, j] = distance
1✔
1364
            log.debug(
1✔
1365
                "[step]   Distance from rotation_symbol[%d] at %s to step[%d]: %.1f",
1366
                i,
1367
                rs.bbox,
1368
                step.step_number.value,
1369
                distance,
1370
            )
1371

1372
    # Apply max_distance threshold by setting costs above threshold to a high value
1373
    # This prevents assignments that are too far apart
1374
    high_cost = max_distance * 10  # Arbitrary large value
1✔
1375
    cost_matrix_thresholded = np.where(
1✔
1376
        cost_matrix > max_distance, high_cost, cost_matrix
1377
    )
1378

1379
    # Run Hungarian algorithm
1380
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1381

1382
    # Assign rotation symbols to steps based on the matching
1383
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1384
        # Check if this assignment is within the max_distance threshold
1385
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1386
            rs = rotation_symbols[row_idx]
1✔
1387
            step = steps[col_idx]
1✔
1388
            step.rotation_symbol = rs
1✔
1389
            # Expand step bbox to include the rotation symbol
1390
            step.bbox = step.bbox.union(rs.bbox)
1✔
1391
            log.debug(
1✔
1392
                "[step] Assigned rotation symbol at %s to step %d "
1393
                "(distance=%.1f, new step bbox=%s)",
1394
                rs.bbox,
1395
                step.step_number.value,
1396
                cost_matrix[row_idx, col_idx],
1397
                step.bbox,
1398
            )
1399
        else:
1400
            log.debug(
×
1401
                "[step] Skipped assignment: rotation_symbol[%d] to step[%d] "
1402
                "distance=%.1f > max_distance=%.1f",
1403
                row_idx,
1404
                col_indices[row_idx] if row_idx < len(col_indices) else -1,
1405
                cost_matrix[row_idx, col_idx],
1406
                max_distance,
1407
            )
1408

1409

1410
def filter_subassembly_values[T](
1✔
1411
    items: list[tuple[int, T]],
1412
    *,
1413
    min_gap: int = 3,
1414
    max_subassembly_start: int = 3,
1415
) -> list[tuple[int, T]]:
1416
    """Filter items to exclude likely subassembly values.
1417

1418
    Subassembly steps (e.g., step 1, 2 inside a subassembly box) often appear
1419
    alongside higher-numbered page-level steps (e.g., 15, 16). This creates
1420
    sequences like [1, 2, 15, 16] which cannot all be page-level steps.
1421

1422
    This function identifies such cases by detecting a significant gap in values
1423
    and filtering out the lower-numbered items that are likely subassembly steps.
1424

1425
    Args:
1426
        items: List of (value, item) tuples where value is the step number
1427
            and item is any associated data (e.g., a Candidate).
1428
        min_gap: Minimum gap size to trigger filtering (exclusive).
1429
            Default is 3, so gaps > 3 trigger filtering.
1430
        max_subassembly_start: Maximum starting value for subassembly detection.
1431
            If min value is <= this, filtering is applied. Default is 3.
1432

1433
    Returns:
1434
        Filtered list of (value, item) tuples, keeping only the higher values
1435
        if a subassembly pattern is detected. Returns original list if no
1436
        filtering is needed.
1437

1438
    Examples:
1439
        >>> filter_subassembly_values([(1, "a"), (2, "b"), (15, "c"), (16, "d")])
1440
        [(15, "c"), (16, "d")]
1441

1442
        >>> filter_subassembly_values([(5, "a"), (6, "b"), (15, "c")])
1443
        [(5, "a"), (6, "b"), (15, "c")]  # min=5 > 3, no filtering
1444
    """
1445
    if len(items) <= 1:
1✔
1446
        return items
1✔
1447

1448
    # Sort by value
1449
    sorted_items = sorted(items, key=lambda x: x[0])
1✔
1450
    values = [v for v, _ in sorted_items]
1✔
1451

1452
    # Find the largest gap between consecutive values
1453
    max_gap_size = 0
1✔
1454
    max_gap_index = -1
1✔
1455
    for i in range(len(values) - 1):
1✔
1456
        gap = values[i + 1] - values[i]
1✔
1457
        if gap > max_gap_size:
1✔
1458
            max_gap_size = gap
1✔
1459
            max_gap_index = i
1✔
1460

1461
    # Check if filtering should occur:
1462
    # 1. Gap must be larger than min_gap
1463
    # 2. The minimum value must be <= max_subassembly_start
1464
    if max_gap_size > min_gap and max_gap_index >= 0:
1✔
1465
        min_value = values[0]
1✔
1466
        if min_value <= max_subassembly_start:
1✔
1467
            threshold = values[max_gap_index + 1]
1✔
1468
            return [(v, item) for v, item in sorted_items if v >= threshold]
1✔
1469

1470
    return items
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc