• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20258147097

16 Dec 2025 05:59AM UTC coverage: 89.094% (+0.4%) from 88.668%
20258147097

push

github

bramp
chore: Minor cleanups

- Add TODO comment about BatchClassificationResult naming
- Remove completed testing improvements from TODO.md

12818 of 14387 relevant lines covered (89.09%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.43
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from collections.abc import Callable, Sequence
1✔
25

26
import numpy as np
1✔
27
from scipy.optimize import linear_sum_assignment
1✔
28

29
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
30
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
31
    CandidateFailedError,
32
    ClassificationResult,
33
)
34
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
35
    LabelClassifier,
36
)
37
from build_a_long.pdf_extract.classifier.score import (
1✔
38
    Score,
39
    Weight,
40
    find_best_scoring,
41
)
42
from build_a_long.pdf_extract.classifier.steps.pairing import (
1✔
43
    PairingConfig,
44
    find_optimal_pairings,
45
    has_divider_between,
46
)
47
from build_a_long.pdf_extract.classifier.text import (
1✔
48
    extract_step_number_value,
49
)
50
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
51
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
52
    Arrow,
53
    Diagram,
54
    Divider,
55
    LegoPageElements,
56
    PartsList,
57
    RotationSymbol,
58
    Step,
59
    StepNumber,
60
    SubAssembly,
61
    SubStep,
62
)
63
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
64

65
log = logging.getLogger(__name__)
1✔
66

67

68
class _StepScore(Score):
1✔
69
    """Internal score representation for step classification."""
70

71
    step_value: int
72
    """The parsed step number value (e.g., 1, 2, 3)."""
1✔
73

74
    step_number_candidate: Candidate
75
    """The step number candidate this step is associated with."""
1✔
76

77
    parts_list_candidate: Candidate | None
78
    """The parts list candidate paired with this step (if any)."""
1✔
79

80
    has_parts_list: bool
81
    """Whether this step has an associated parts list."""
1✔
82

83
    step_proximity_score: float
84
    """Score based on proximity to the PartsList above (0.0-1.0).
1✔
85
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
86

87
    step_alignment_score: float
88
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
1✔
89
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
90

91
    def score(self) -> Weight:
1✔
92
        """Return the overall pairing score."""
93
        return self.overall_score()
×
94

95
    def overall_score(self) -> float:
1✔
96
        """Calculate overall quality score based on parts list pairing.
97

98
        Steps with a parts_list are given a bonus to prefer them over
99
        steps without parts_list. Diagrams are found at build time, not
100
        during scoring, to allow rotation symbols to claim small images first.
101
        """
102
        if self.has_parts_list:
1✔
103
            # Base score for having parts_list + proximity/alignment bonus
104
            parts_list_bonus = 0.5
1✔
105
            pairing_score = (
1✔
106
                self.step_proximity_score + self.step_alignment_score
107
            ) / 2.0
108
            return parts_list_bonus + 0.5 * pairing_score
1✔
109
        return 0.3  # Lower base score for steps without parts list
1✔
110

111
    def sort_key(self) -> tuple[float, int]:
1✔
112
        """Return a tuple for sorting candidates.
113

114
        We prefer:
115
        1. Higher overall scores (better StepNumber-PartsList-Diagram match)
116
        2. Lower step number values (to break ties and maintain order)
117
        """
118
        return (-self.overall_score(), self.step_value)
1✔
119

120

121
class StepClassifier(LabelClassifier):
1✔
122
    """Classifier for complete Step structures."""
123

124
    output = "step"
1✔
125
    requires = frozenset(
1✔
126
        {
127
            "arrow",
128
            "step_number",
129
            "parts_list",
130
            "diagram",
131
            "rotation_symbol",
132
            "subassembly",
133
            "substep",
134
            "preview",
135
        }
136
    )
137

138
    def _score(self, result: ClassificationResult) -> None:
1✔
139
        """Score step pairings and create candidates."""
140
        page_data = result.page_data
1✔
141

142
        # Get step number and parts list candidates (not constructed elements)
143
        step_number_candidates = result.get_scored_candidates(
1✔
144
            "step_number", valid_only=False, exclude_failed=True
145
        )
146

147
        if not step_number_candidates:
1✔
148
            return
1✔
149

150
        # Get parts_list candidates
151
        parts_list_candidates = result.get_scored_candidates(
1✔
152
            "parts_list",
153
            valid_only=False,
154
            exclude_failed=True,
155
        )
156

157
        log.debug(
1✔
158
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
159
            page_data.page_number,
160
            len(step_number_candidates),
161
            len(parts_list_candidates),
162
        )
163

164
        # Create all possible Step candidates for pairings (without diagrams initially)
165
        # Deduplication happens at build time, not scoring time, so that
166
        # diagram assignment can be re-evaluated as blocks get claimed.
167
        all_candidates: list[Candidate] = []
1✔
168
        for step_candidate in step_number_candidates:
1✔
169
            # Create candidates for this StepNumber paired with each PartsList
170
            for parts_list_candidate in parts_list_candidates:
1✔
171
                candidate = self._create_step_candidate(
1✔
172
                    step_candidate, parts_list_candidate, result
173
                )
174
                if candidate:
1✔
175
                    all_candidates.append(candidate)
1✔
176

177
            # Also create a candidate with no PartsList (fallback)
178
            candidate = self._create_step_candidate(step_candidate, None, result)
1✔
179
            if candidate:
1✔
180
                all_candidates.append(candidate)
1✔
181

182
        # Add all candidates to result (deduplication happens at build time)
183
        for candidate in all_candidates:
1✔
184
            result.add_candidate(candidate)
1✔
185

186
        log.debug(
1✔
187
            "[step] Created %d step candidates",
188
            len(all_candidates),
189
        )
190

191
    def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
1✔
192
        """Build all Step elements with coordinated element assignment.
193

194
        Build Order (phases run sequentially):
195
        - Phase 1: Rotation symbols - claim small circular arrow graphics
196
        - Phase 2: Parts lists - claim part count blocks and part images
197
        - Phase 3: Arrows - claim arrowhead triangles and shaft lines/rectangles
198
        - Phase 4: Subassemblies & Previews - claim white callout boxes
199
        - Phase 5: Steps - build step_number + parts_list pairs (no diagram yet)
200
        - Phase 6: Diagrams - build remaining unclaimed diagram regions
201
        - Phase 7: Assign diagrams to steps via Hungarian matching
202
        - Phase 8: Assign subassemblies to steps via Hungarian matching
203
        - Phase 9: Finalize steps - add arrows, compute final bboxes
204
        - Phase 10: Assign rotation symbols to steps via Hungarian matching
205

206
        The order matters because elements compete for the same Drawing blocks.
207
        Earlier phases claim blocks first, preventing later phases from using them.
208
        For example:
209
        - Arrows must build before subassemblies so arrow shafts aren't consumed
210
          by subassembly white boxes
211
        - Rotation symbols must build before diagrams so small circular graphics
212
          aren't incorrectly clustered into diagrams
213

214
        Deduplication of steps happens at build time (not scoring time) so that
215
        diagram assignment can be re-evaluated as blocks get claimed.
216

217
        Returns:
218
            List of successfully constructed Step elements, sorted by step number
219
        """
220
        # Phase 1: Build all rotation symbols BEFORE steps.
221
        # This allows rotation symbols to claim their Drawing blocks first,
222
        # preventing them from being incorrectly clustered into diagrams.
223
        for rs_candidate in result.get_scored_candidates(
1✔
224
            "rotation_symbol", valid_only=False, exclude_failed=True
225
        ):
226
            try:
1✔
227
                result.build(rs_candidate)
1✔
228
                log.debug(
1✔
229
                    "[step] Built rotation symbol at %s (score=%.2f)",
230
                    rs_candidate.bbox,
231
                    rs_candidate.score,
232
                )
233
            except Exception as e:
×
234
                log.debug(
×
235
                    "[step] Failed to construct rotation_symbol candidate at %s: %s",
236
                    rs_candidate.bbox,
237
                    e,
238
                )
239

240
        # Phase 2: Build all parts lists BEFORE steps.
241
        # This allows parts lists to claim their Drawing blocks first,
242
        # preventing them from being claimed by subassemblies.
243
        for pl_candidate in result.get_scored_candidates(
1✔
244
            "parts_list", valid_only=False, exclude_failed=True
245
        ):
246
            try:
1✔
247
                result.build(pl_candidate)
1✔
248
                log.debug(
1✔
249
                    "[step] Built parts_list at %s (score=%.2f)",
250
                    pl_candidate.bbox,
251
                    pl_candidate.score,
252
                )
253
            except Exception as e:
×
254
                log.debug(
×
255
                    "[step] Failed to construct parts_list candidate at %s: %s",
256
                    pl_candidate.bbox,
257
                    e,
258
                )
259

260
        # Phase 3: Build all arrows BEFORE subassemblies.
261
        # Arrows connect subassembly callout boxes to main diagrams. The arrow
262
        # shaft may be a thin rectangle that overlaps the subassembly's white
263
        # box. By building arrows first, they claim the shaft blocks before
264
        # subassemblies can consume them.
265
        for arrow_candidate in result.get_scored_candidates(
1✔
266
            "arrow", valid_only=False, exclude_failed=True
267
        ):
268
            try:
1✔
269
                result.build(arrow_candidate)
1✔
270
                log.debug(
1✔
271
                    "[step] Built arrow at %s (score=%.2f)",
272
                    arrow_candidate.bbox,
273
                    arrow_candidate.score,
274
                )
275
            except Exception as e:
×
276
                log.debug(
×
277
                    "[step] Failed to construct arrow candidate at %s: %s",
278
                    arrow_candidate.bbox,
279
                    e,
280
                )
281

282
        # Phase 4: Build subassemblies and previews BEFORE steps.
283
        # Both subassemblies and previews are white boxes with diagrams inside.
284
        # We combine them and build in score order so the higher-scoring
285
        # candidate claims the white box first. When a candidate is built,
286
        # its source_blocks are marked as consumed, causing any competing
287
        # candidate using the same blocks to fail.
288
        #
289
        # This allows subassemblies (which have step_count labels like "2x")
290
        # to be distinguished from previews (which appear before steps).
291
        subassembly_candidates = result.get_scored_candidates(
1✔
292
            "subassembly", valid_only=False, exclude_failed=True
293
        )
294
        preview_candidates = result.get_scored_candidates(
1✔
295
            "preview", valid_only=False, exclude_failed=True
296
        )
297

298
        # Combine and sort by score (highest first)
299
        combined_candidates = list(subassembly_candidates) + list(preview_candidates)
1✔
300
        combined_candidates.sort(key=lambda c: c.score, reverse=True)
1✔
301

302
        for candidate in combined_candidates:
1✔
303
            try:
1✔
304
                result.build(candidate)
1✔
305
                log.debug(
1✔
306
                    "[step] Built %s at %s (score=%.2f)",
307
                    candidate.label,
308
                    candidate.bbox,
309
                    candidate.score,
310
                )
311
            except Exception as e:
1✔
312
                log.debug(
1✔
313
                    "[step] Failed to construct %s candidate at %s: %s",
314
                    candidate.label,
315
                    candidate.bbox,
316
                    e,
317
                )
318

319
        # Phase 5: Build Step candidates with deduplication by step value
320
        # Filter out candidates that look like substeps (small step numbers with
321
        # a significant gap from main step numbers). These are handled by
322
        # SubStepClassifier instead.
323
        # Steps are built as "partial" - just step_number + parts_list.
324
        # Diagrams and subassemblies are assigned later via Hungarian matching.
325
        all_step_candidates = result.get_scored_candidates(
1✔
326
            "step", valid_only=False, exclude_failed=True
327
        )
328
        page_level_step_candidates = self._filter_page_level_step_candidates(
1✔
329
            all_step_candidates
330
        )
331

332
        # Sort by score (highest first) so best candidates get built first
333
        sorted_candidates = sorted(
1✔
334
            page_level_step_candidates,
335
            key=lambda c: c.score,
336
            reverse=True,
337
        )
338

339
        steps: list[Step] = []
1✔
340
        built_step_values: set[int] = set()
1✔
341

342
        for step_candidate in sorted_candidates:
1✔
343
            # Get step value from score
344
            score = step_candidate.score_details
1✔
345
            if not isinstance(score, _StepScore):
1✔
346
                continue
×
347

348
            # Skip if we've already built a step with this value
349
            if score.step_value in built_step_values:
1✔
350
                log.debug(
1✔
351
                    "[step] Skipping duplicate step %d candidate (score=%.2f)",
352
                    score.step_value,
353
                    step_candidate.score,
354
                )
355
                continue
1✔
356

357
            try:
1✔
358
                elem = result.build(step_candidate)
1✔
359
                assert isinstance(elem, Step)
1✔
360
                steps.append(elem)
1✔
361
                built_step_values.add(score.step_value)
1✔
362
                log.debug(
1✔
363
                    "[step] Built partial step %d (parts_list=%s, score=%.2f)",
364
                    score.step_value,
365
                    "yes" if score.parts_list_candidate is not None else "no",
366
                    step_candidate.score,
367
                )
368
            except Exception as e:
1✔
369
                log.debug(
1✔
370
                    "[step] Failed to construct step %d candidate: %s",
371
                    score.step_value,
372
                    e,
373
                )
374

375
        # Sort steps by their step_number value
376
        steps.sort(key=lambda step: step.step_number.value)
1✔
377

378
        # Phase 6: Build diagrams (not yet claimed by subassemblies)
379
        # Phase 7: Assign diagrams to steps using Hungarian matching
380
        # Collect available diagram candidates (not yet claimed by subassemblies)
381
        available_diagrams: list[Diagram] = []
1✔
382
        for diag_candidate in result.get_scored_candidates(
1✔
383
            "diagram", valid_only=False, exclude_failed=True
384
        ):
385
            if diag_candidate.constructed is None:
1✔
386
                # Build the diagram
387
                try:
1✔
388
                    diag_elem = result.build(diag_candidate)
1✔
389
                    assert isinstance(diag_elem, Diagram)
1✔
390
                    available_diagrams.append(diag_elem)
1✔
391
                except Exception as e:
1✔
392
                    log.debug(
1✔
393
                        "[step] Failed to build diagram at %s: %s",
394
                        diag_candidate.bbox,
395
                        e,
396
                    )
397
            elif isinstance(diag_candidate.constructed, Diagram):
1✔
398
                # Already built (claimed by subassembly) - skip
399
                pass
1✔
400

401
        log.debug(
1✔
402
            "[step] Available diagrams for step assignment: %d",
403
            len(available_diagrams),
404
        )
405

406
        # Collect divider bboxes for obstruction checking (used by multiple phases)
407
        divider_bboxes: list[BBox] = []
1✔
408
        for div_candidate in result.get_scored_candidates("divider", valid_only=True):
1✔
409
            assert div_candidate.constructed is not None
1✔
410
            assert isinstance(div_candidate.constructed, Divider)
1✔
411
            divider_bboxes.append(div_candidate.constructed.bbox)
1✔
412

413
        # Assign diagrams to steps using Hungarian matching
414
        # Dividers prevent pairing across page divisions
415
        assign_diagrams_to_steps(steps, available_diagrams, divider_bboxes)
1✔
416

417
        # Phase 8: Assign subassemblies to steps using Hungarian matching
418
        # Collect built subassemblies
419
        subassemblies: list[SubAssembly] = []
1✔
420
        for sa_candidate in result.get_scored_candidates(
1✔
421
            "subassembly", valid_only=True
422
        ):
423
            assert sa_candidate.constructed is not None
1✔
424
            assert isinstance(sa_candidate.constructed, SubAssembly)
1✔
425
            subassemblies.append(sa_candidate.constructed)
1✔
426

427
        log.debug(
1✔
428
            "[step] Assigning %d subassemblies to %d steps (%d dividers for "
429
            "obstruction checking)",
430
            len(subassemblies),
431
            len(steps),
432
            len(divider_bboxes),
433
        )
434

435
        # Assign subassemblies to steps using Hungarian matching
436
        assign_subassemblies_to_steps(steps, subassemblies, divider_bboxes)
1✔
437

438
        # Phase 8b: Get unclaimed SubSteps as naked substeps
439
        # SubStepClassifier found small step number + diagram pairs.
440
        # SubAssemblyClassifier claimed those inside callout boxes.
441
        # The remaining ones become "naked" substeps for the main step.
442
        if steps:
1✔
443
            substeps = self._get_unclaimed_substeps(result)
1✔
444
            if substeps:
1✔
445
                # Assign all substeps to the single main step on the page
446
                # (In the future, we could use Hungarian matching if there are
447
                # multiple main steps, but for now this covers the common case)
448
                main_step = steps[0]  # Usually only one main step when substeps exist
1✔
449
                object.__setattr__(main_step, "substeps", substeps)
1✔
450
                log.debug(
1✔
451
                    "[step] Assigned %d substeps to step %d",
452
                    len(substeps),
453
                    main_step.step_number.value,
454
                )
455

456
        # Phase 9: Finalize steps - collect arrows and compute final bboxes
457
        page_bbox = result.page_data.bbox
1✔
458
        for step in steps:
1✔
459
            # Get arrows for this step
460
            arrows = self._get_arrows_for_step(step.step_number, step.diagram, result)
1✔
461

462
            # Compute final bbox including all components
463
            final_bbox = self._compute_step_bbox(
1✔
464
                step.step_number,
465
                step.parts_list,
466
                step.diagram,
467
                step.subassemblies,
468
                page_bbox,
469
            )
470

471
            # Update the step (Step is a Pydantic model, so we need to reassign)
472
            # We mutate in place by directly setting attributes
473
            object.__setattr__(step, "arrows", arrows)
1✔
474
            object.__setattr__(step, "bbox", final_bbox)
1✔
475

476
        # Phase 10: Assign rotation symbols to steps using Hungarian matching
477
        # Collect built rotation symbols
478
        rotation_symbols: list[RotationSymbol] = []
1✔
479
        for rs_candidate in result.get_scored_candidates(
1✔
480
            "rotation_symbol", valid_only=True
481
        ):
482
            assert rs_candidate.constructed is not None
1✔
483
            assert isinstance(rs_candidate.constructed, RotationSymbol)
1✔
484
            rotation_symbols.append(rs_candidate.constructed)
1✔
485

486
        assign_rotation_symbols_to_steps(steps, rotation_symbols)
1✔
487

488
        log.debug(
1✔
489
            "[step] build_all complete: %d steps, %d rotation symbols assigned",
490
            len(steps),
491
            sum(1 for s in steps if s.rotation_symbol is not None),
492
        )
493

494
        # Cast for type compatibility with base class return type
495
        return list[LegoPageElements](steps)
1✔
496

497
    def _filter_page_level_step_candidates(
1✔
498
        self, candidates: list[Candidate]
499
    ) -> list[Candidate]:
500
        """Filter step candidates to exclude likely substep candidates.
501

502
        When a page has both low-numbered steps (1, 2, 3, 4) and high-numbered
503
        steps (338), with a significant gap between them, the low numbers are
504
        likely substeps rather than main steps. These are handled by
505
        SubStepClassifier instead.
506

507
        Args:
508
            candidates: All step candidates to filter
509

510
        Returns:
511
            Page-level step candidates (excluding substep candidates)
512
        """
513
        # Extract step number values from candidates
514
        items_with_values: list[tuple[int, Candidate]] = []
1✔
515
        for candidate in candidates:
1✔
516
            score = candidate.score_details
1✔
517
            if not isinstance(score, _StepScore):
1✔
518
                continue
×
519
            items_with_values.append((score.step_value, candidate))
1✔
520

521
        # Use generic filtering function to find page-level steps
522
        filtered_items = filter_subassembly_values(items_with_values)
1✔
523

524
        # If filtering occurred, return only page-level candidates
525
        if len(filtered_items) != len(items_with_values):
1✔
526
            page_level_values = {v for v, _ in filtered_items}
1✔
527
            page_level_candidates = [
1✔
528
                item for v, item in items_with_values if v in page_level_values
529
            ]
530
            log.debug(
1✔
531
                "[step] Filtered to page-level candidates: %s (excluded substeps: %s)",
532
                sorted(page_level_values),
533
                sorted(v for v, _ in items_with_values if v not in page_level_values),
534
            )
535
            return page_level_candidates
1✔
536

537
        # No filtering occurred, return all as page-level
538
        return candidates
1✔
539

540
    def _get_unclaimed_substeps(
1✔
541
        self,
542
        result: ClassificationResult,
543
    ) -> list[SubStep]:
544
        """Get SubStep elements that weren't claimed by a SubAssembly.
545

546
        SubStepClassifier creates candidates for all small step number +
547
        diagram pairs. SubAssemblyClassifier then claims those inside callout boxes.
548
        This method returns the remaining unclaimed SubSteps for use as
549
        "naked" substeps in a Step.
550

551
        The substeps are filtered to only include valid sequences starting from 1.
552
        This helps filter out PieceLengths or other misidentified elements.
553

554
        Args:
555
            result: Classification result
556

557
        Returns:
558
            List of SubStep elements that weren't claimed by any SubAssembly,
559
            sorted by step number value, and filtered to valid sequences from 1.
560
        """
561
        from build_a_long.pdf_extract.classifier.steps.substep_classifier import (
1✔
562
            _SubStepScore,
563
        )
564

565
        # Get all substep candidates
566
        substep_candidates = result.get_scored_candidates(
1✔
567
            "substep", valid_only=False, exclude_failed=True
568
        )
569

570
        # Filter to only unclaimed candidates (not yet built)
571
        unclaimed_candidates = [c for c in substep_candidates if c.constructed is None]
1✔
572

573
        if not unclaimed_candidates:
1✔
574
            return []
1✔
575

576
        # Extract step values from candidates to filter BEFORE building
577
        def get_step_value(candidate: Candidate) -> int:
1✔
578
            """Get step value from a substep candidate's score."""
579
            score = candidate.score_details
1✔
580
            assert isinstance(score, _SubStepScore)
1✔
581

582
            return score.step_value
1✔
583

584
        # Filter candidates to valid sequential substeps (must start from 1)
585
        valid_candidates = filter_valid_substep_sequence(
1✔
586
            unclaimed_candidates, get_step_value
587
        )
588

589
        if not valid_candidates:
1✔
590
            log.debug("[step] No valid substep sequence found (no step 1)")
1✔
591
            return []
1✔
592

593
        # Now build only the filtered candidates
594
        substeps: list[SubStep] = []
1✔
595
        for candidate in valid_candidates:
1✔
596
            try:
1✔
597
                substep_elem = result.build(candidate)
1✔
598
                assert isinstance(substep_elem, SubStep)
1✔
599
                substeps.append(substep_elem)
1✔
600
            except CandidateFailedError:
1✔
601
                # Already claimed - skip
602
                continue
1✔
603

604
        # Sort by step number value
605
        substeps.sort(key=lambda s: s.step_number.value)
1✔
606

607
        log.debug(
1✔
608
            "[step] Found %d unclaimed substeps (after sequence filtering)",
609
            len(substeps),
610
        )
611

612
        return substeps
1✔
613

614
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
1✔
615
        """Construct a partial Step element from a single candidate.
616

617
        This creates a Step with just step_number and parts_list. The diagram,
618
        subassemblies, and arrows are assigned later in build_all() using
619
        Hungarian matching for optimal global assignment.
620
        """
621
        score = candidate.score_details
1✔
622
        assert isinstance(score, _StepScore)
1✔
623

624
        # Validate and extract step number from parent candidate
625
        step_num_candidate = score.step_number_candidate
1✔
626

627
        step_num_elem = result.build(step_num_candidate)
1✔
628
        assert isinstance(step_num_elem, StepNumber)
1✔
629
        step_num = step_num_elem
1✔
630

631
        # Validate and extract parts list from parent candidate (if present)
632
        parts_list = None
1✔
633
        if score.parts_list_candidate:
1✔
634
            parts_list_candidate = score.parts_list_candidate
1✔
635
            parts_list_elem = result.build(parts_list_candidate)
1✔
636
            assert isinstance(parts_list_elem, PartsList)
1✔
637
            parts_list = parts_list_elem
1✔
638

639
        # Create partial step - diagram, subassemblies, arrows assigned later
640
        initial_bbox = step_num.bbox
1✔
641
        if parts_list:
1✔
642
            initial_bbox = initial_bbox.union(parts_list.bbox)
1✔
643

644
        return Step(
1✔
645
            bbox=initial_bbox,
646
            step_number=step_num,
647
            parts_list=parts_list,
648
            diagram=None,
649
            rotation_symbol=None,
650
            arrows=[],
651
            subassemblies=[],
652
        )
653

654
    def _find_and_build_diagram_for_step(
1✔
655
        self,
656
        step_num: StepNumber,
657
        parts_list: PartsList | None,
658
        result: ClassificationResult,
659
    ) -> Diagram | None:
660
        """Find and build the best diagram for this step.
661

662
        This is called at build time, after rotation symbols and subassemblies
663
        have been built (in build_all Phases 1 and 3), so they've already
664
        claimed any images/diagrams they need. We filter to only consider
665
        diagram candidates that haven't been constructed yet.
666

667
        Args:
668
            step_num: The built step number element
669
            parts_list: The built parts list element (if any)
670
            result: Classification result containing diagram candidates
671

672
        Returns:
673
            The built Diagram element, or None if no suitable diagram found
674
        """
675
        # Get all non-failed, non-constructed diagram candidates
676
        diagram_candidates = result.get_scored_candidates(
×
677
            "diagram", valid_only=False, exclude_failed=True
678
        )
679

680
        # Filter to only candidates that haven't been built yet
681
        # (subassemblies and rotation symbols built earlier may have claimed some)
682
        available_candidates = [c for c in diagram_candidates if c.constructed is None]
×
683

684
        if not available_candidates:
×
685
            log.debug(
×
686
                "[step] No diagram candidates available for step %d",
687
                step_num.value,
688
            )
689
            return None
×
690

691
        # Score each candidate based on position relative to step
692
        step_bbox = step_num.bbox
×
693
        best_candidate = None
×
694
        best_score = -float("inf")
×
695

696
        for candidate in available_candidates:
×
697
            score = self._score_step_diagram_pair(step_bbox, candidate.bbox)
×
698

699
            log.debug(
×
700
                "[step] Diagram candidate at %s for step %d: score=%.2f",
701
                candidate.bbox,
702
                step_num.value,
703
                score,
704
            )
705

706
            if score > best_score:
×
707
                best_score = score
×
708
                best_candidate = candidate
×
709

710
        if best_candidate is None or best_score < 0.2:
×
711
            log.debug(
×
712
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
713
                step_num.value,
714
                best_score,
715
            )
716
            return None
×
717

718
        # Build the diagram
719
        try:
×
720
            diagram_elem = result.build(best_candidate)
×
721
            assert isinstance(diagram_elem, Diagram)
×
722
            log.debug(
×
723
                "[step] Built diagram at %s for step %d (score=%.2f)",
724
                diagram_elem.bbox,
725
                step_num.value,
726
                best_score,
727
            )
728
            return diagram_elem
×
729
        except CandidateFailedError as e:
×
730
            log.debug(
×
731
                "[step] Failed to build diagram for step %d: %s",
732
                step_num.value,
733
                e,
734
            )
735
            return None
×
736

737
    def _create_step_candidate(
1✔
738
        self,
739
        step_candidate: Candidate,
740
        parts_list_candidate: Candidate | None,
741
        result: ClassificationResult,
742
    ) -> Candidate | None:
743
        """Create a Step candidate (without diagram assignment).
744

745
        Diagrams are found at build time, not during scoring, to allow
746
        rotation symbols to claim small images first.
747

748
        Args:
749
            step_candidate: The StepNumber candidate for this step
750
            parts_list_candidate: The PartsList candidate to pair with (or None)
751
            result: Classification result
752

753
        Returns:
754
            The created Candidate with score but no construction, or None if
755
            the step number value cannot be extracted.
756
        """
757
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
1✔
758
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
1✔
759
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance
1✔
760

761
        # Extract step number value from the candidate
762
        if not step_candidate.source_blocks:
1✔
763
            return None
×
764
        source_block = step_candidate.source_blocks[0]
1✔
765
        if not isinstance(source_block, Text):
1✔
766
            return None
×
767
        step_value = extract_step_number_value(source_block.text)
1✔
768
        if step_value is None:
1✔
769
            return None
×
770

771
        step_bbox = step_candidate.bbox
1✔
772
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None
1✔
773

774
        # Calculate pairing scores if there's a parts_list above the step
775
        proximity_score = 0.0
1✔
776
        alignment_score = 0.0
1✔
777

778
        if (
1✔
779
            parts_list_bbox is not None
780
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
781
        ):
782
            # Calculate distance (how far apart vertically)
783
            distance = step_bbox.y0 - parts_list_bbox.y1
1✔
784

785
            # Calculate proximity score
786
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
1✔
787
            if max_distance > 0:
1✔
788
                proximity_score = max(0.0, 1.0 - (distance / max_distance))
1✔
789

790
            # Calculate alignment score (how well left edges align)
791
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
1✔
792
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
1✔
793
            if max_alignment_diff > 0:
1✔
794
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))
1✔
795

796
        # Create score object with candidate references
797
        # Diagrams are found at build time, not during scoring
798
        score = _StepScore(
1✔
799
            step_value=step_value,
800
            step_number_candidate=step_candidate,
801
            parts_list_candidate=parts_list_candidate,
802
            has_parts_list=parts_list_candidate is not None,
803
            step_proximity_score=proximity_score,
804
            step_alignment_score=alignment_score,
805
        )
806

807
        # Calculate combined bbox for the candidate (without diagram)
808
        combined_bbox = step_bbox
1✔
809
        if parts_list_bbox:
1✔
810
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)
1✔
811

812
        # Create candidate
813
        return Candidate(
1✔
814
            bbox=combined_bbox,
815
            label="step",
816
            score=score.overall_score(),
817
            score_details=score,
818
            source_blocks=[],
819
        )
820

821
    def _score_step_diagram_pair(
1✔
822
        self,
823
        step_bbox: BBox,
824
        diagram_bbox: BBox,
825
    ) -> float:
826
        """Score how well a diagram matches a step.
827

828
        Diagrams are typically positioned to the right of and/or below the step
829
        number. This method scores based on:
830
        - Horizontal position: prefer diagrams to the right, penalize left
831
        - Vertical position: prefer diagrams below the step header
832
        - Distance: closer is better
833

834
        Args:
835
            step_bbox: The step number bounding box
836
            diagram_bbox: The diagram bounding box to score
837

838
        Returns:
839
            Score between 0.0 and 1.0 (higher is better match)
840
        """
841
        # Reference point: bottom-right of step number
842
        ref_x = step_bbox.x1
×
843
        ref_y = step_bbox.y1
×
844

845
        # TODO Move all these constants into config, or make them adaptive?
846

847
        # Horizontal score
848
        # Diagrams to the right are preferred, but allow some overlap
849
        x_offset = diagram_bbox.x0 - ref_x
×
850

851
        if x_offset >= -50:
×
852
            # Diagram starts to the right or slightly overlapping - good
853
            # Score decreases slightly with distance to the right
854
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
×
855
        elif x_offset >= -200:
×
856
            # Diagram is moderately to the left - acceptable
857
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
×
858
        else:
859
            # Diagram is far to the left - poor match
860
            x_score = max(0.1, 0.3 + x_offset / 400.0)
×
861

862
        # Vertical score
863
        # Diagrams below the step header are preferred
864
        y_offset = diagram_bbox.y0 - ref_y
×
865

866
        if y_offset >= -30:
×
867
            # Diagram starts below or slightly overlapping - good
868
            # Score decreases with vertical distance
869
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
×
870
        elif y_offset >= -100:
×
871
            # Diagram is moderately above - less good but acceptable
872
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
×
873
        else:
874
            # Diagram is far above - poor match
875
            y_score = max(0.05, 0.2 + y_offset / 300.0)
×
876

877
        # Combined score - weight both dimensions equally
878
        score = 0.5 * x_score + 0.5 * y_score
×
879

880
        return score
×
881

882
    def _get_rotation_symbol_for_step(
1✔
883
        self,
884
        step_bbox: BBox,
885
        result: ClassificationResult,
886
    ) -> RotationSymbol | None:
887
        """Find an already-built rotation symbol within this step's area.
888

889
        Rotation symbols are built by PageClassifier before steps are processed.
890
        This method finds the already-built rotation symbol that falls within
891
        the step's bounding box.
892

893
        Args:
894
            step_bbox: The step's bounding box (including step_num, parts_list,
895
                and diagram)
896
            result: Classification result containing rotation symbol candidates
897

898
        Returns:
899
            Single RotationSymbol element for this step, or None if not found
900
        """
901
        # Get rotation_symbol candidates - they should already be built
902
        # by PageClassifier. Use valid_only=True to only get successfully
903
        # constructed rotation symbols.
904
        rotation_symbol_candidates = result.get_scored_candidates(
×
905
            "rotation_symbol", valid_only=True
906
        )
907

908
        log.debug(
×
909
            "[step] Looking for rotation symbols in step bbox %s, "
910
            "found %d built candidates",
911
            step_bbox,
912
            len(rotation_symbol_candidates),
913
        )
914

915
        if not rotation_symbol_candidates:
×
916
            return None
×
917

918
        # Find best-scoring rotation symbol overlapping the step's bbox
919
        overlapping_candidates = filter_overlapping(
×
920
            rotation_symbol_candidates, step_bbox
921
        )
922
        best_candidate = find_best_scoring(overlapping_candidates)
×
923

924
        if best_candidate and best_candidate.constructed is not None:
×
925
            rotation_symbol = best_candidate.constructed
×
926
            assert isinstance(rotation_symbol, RotationSymbol)
×
927
            log.debug(
×
928
                "[step] Found rotation symbol at %s (score=%.2f)",
929
                rotation_symbol.bbox,
930
                best_candidate.score,
931
            )
932
            return rotation_symbol
×
933

934
        log.debug("[step] No rotation symbol found in step bbox %s", step_bbox)
×
935
        return None
×
936

937
    def _get_arrows_for_step(
1✔
938
        self,
939
        step_num: StepNumber,
940
        diagram: Diagram | None,
941
        result: ClassificationResult,
942
    ) -> list[Arrow]:
943
        """Collect already-built arrows that belong to this step.
944

945
        Arrows are built in Phase 3 of build_all(), before subassemblies.
946
        This method finds arrows that are positioned near the step's diagram
947
        and assigns them to this step.
948

949
        Typically these are arrows pointing from subassembly callout boxes
950
        to the main diagram.
951

952
        Args:
953
            step_num: The step number element
954
            diagram: The diagram element (if any)
955
            result: Classification result containing arrow candidates
956

957
        Returns:
958
            List of Arrow elements for this step
959
        """
960
        # Get already-built arrows (built in Phase 3)
961
        arrow_candidates = result.get_scored_candidates(
1✔
962
            "arrow",
963
            valid_only=True,  # Only get successfully built arrows
964
        )
965

966
        log.debug(
1✔
967
            "[step] Looking for arrows for step %d, found %d built arrows",
968
            step_num.value,
969
            len(arrow_candidates),
970
        )
971

972
        if not arrow_candidates:
1✔
973
            return []
1✔
974

975
        # Determine search region: prefer diagram area, fallback to step area
976
        search_bbox = diagram.bbox if diagram else step_num.bbox
1✔
977

978
        # Expand search region to catch arrows near the diagram
979
        # Use a larger margin than rotation symbols since arrows can extend further
980
        search_region = search_bbox.expand(100.0)
1✔
981

982
        log.debug(
1✔
983
            "[step] Arrow search region for step %d: %s",
984
            step_num.value,
985
            search_region,
986
        )
987

988
        # Find arrows within or overlapping the search region
989
        arrows: list[Arrow] = []
1✔
990
        overlapping_candidates = filter_overlapping(arrow_candidates, search_region)
1✔
991

992
        for candidate in overlapping_candidates:
1✔
993
            assert candidate.constructed is not None
1✔
994
            assert isinstance(candidate.constructed, Arrow)
1✔
995
            log.debug(
1✔
996
                "[step]   Arrow at %s overlaps search region (score=%.2f)",
997
                candidate.bbox,
998
                candidate.score,
999
            )
1000
            arrows.append(candidate.constructed)
1✔
1001

1002
        log.debug(
1✔
1003
            "[step] Found %d arrows for step %d",
1004
            len(arrows),
1005
            step_num.value,
1006
        )
1007
        return arrows
1✔
1008

1009
    def _get_subassemblies_for_step(
1✔
1010
        self,
1011
        step_num: StepNumber,
1012
        diagram: Diagram | None,
1013
        result: ClassificationResult,
1014
    ) -> list[SubAssembly]:
1015
        """Get already-built subassemblies that belong to this step.
1016

1017
        Subassemblies are built in build_all() before steps. This method finds
1018
        subassemblies that are positioned near this step's diagram and haven't
1019
        been claimed by another step yet.
1020

1021
        Args:
1022
            step_num: The step number element
1023
            diagram: The diagram element (if any)
1024
            result: Classification result containing subassembly candidates
1025

1026
        Returns:
1027
            List of SubAssembly elements for this step
1028
        """
1029
        # Get all built subassemblies
1030
        all_subassemblies: list[SubAssembly] = []
×
1031
        for sa_candidate in result.get_scored_candidates(
×
1032
            "subassembly", valid_only=True
1033
        ):
1034
            assert sa_candidate.constructed is not None
×
1035
            assert isinstance(sa_candidate.constructed, SubAssembly)
×
1036
            all_subassemblies.append(sa_candidate.constructed)
×
1037

1038
        log.debug(
×
1039
            "[step] Looking for subassemblies for step %d, found %d built",
1040
            step_num.value,
1041
            len(all_subassemblies),
1042
        )
1043

1044
        if not all_subassemblies:
×
1045
            return []
×
1046

1047
        # Determine search region based on step_number and diagram
1048
        reference_bbox = diagram.bbox.union(step_num.bbox) if diagram else step_num.bbox
×
1049

1050
        page_bbox = result.page_data.bbox
×
1051

1052
        # Expand search region around the reference area
1053
        # Horizontally: search the full page width
1054
        # Vertically: search a region around the reference bbox
1055
        vertical_margin = reference_bbox.height
×
1056
        search_region = BBox(
×
1057
            x0=page_bbox.x0,
1058
            y0=max(page_bbox.y0, reference_bbox.y0 - vertical_margin),
1059
            x1=page_bbox.x1,
1060
            y1=min(page_bbox.y1, reference_bbox.y1 + vertical_margin),
1061
        )
1062

1063
        log.debug(
×
1064
            "[step] SubAssembly search region for step %d: %s",
1065
            step_num.value,
1066
            search_region,
1067
        )
1068

1069
        # Find subassemblies that overlap the search region
1070
        subassemblies: list[SubAssembly] = []
×
1071
        for subassembly in all_subassemblies:
×
1072
            if subassembly.bbox.overlaps(search_region):
×
1073
                log.debug(
×
1074
                    "[step]   SubAssembly at %s overlaps search region",
1075
                    subassembly.bbox,
1076
                )
1077
                subassemblies.append(subassembly)
×
1078

1079
        log.debug(
×
1080
            "[step] Found %d subassemblies for step %d",
1081
            len(subassemblies),
1082
            step_num.value,
1083
        )
1084
        return subassemblies
×
1085

1086
    def _compute_step_bbox(
1✔
1087
        self,
1088
        step_num: StepNumber,
1089
        parts_list: PartsList | None,
1090
        diagram: Diagram | None,
1091
        subassemblies: list[SubAssembly],
1092
        page_bbox: BBox,
1093
    ) -> BBox:
1094
        """Compute the overall bounding box for the Step.
1095

1096
        This encompasses the step number, parts list (if any), diagram (if any),
1097
        and all subassemblies.
1098
        The result is clipped to the page bounds to handle elements that extend
1099
        slightly off-page (e.g., arrows in diagrams).
1100

1101
        Args:
1102
            step_num: The step number element
1103
            parts_list: The parts list (if any)
1104
            diagram: The diagram element (if any)
1105
            subassemblies: List of subassemblies for this step
1106
            page_bbox: The page bounding box to clip to
1107

1108
        Returns:
1109
            Combined bounding box, clipped to page bounds
1110
        """
1111
        bboxes = [step_num.bbox]
1✔
1112
        if parts_list:
1✔
1113
            bboxes.append(parts_list.bbox)
1✔
1114
        if diagram:
1✔
1115
            bboxes.append(diagram.bbox)
1✔
1116
        for sa in subassemblies:
1✔
1117
            bboxes.append(sa.bbox)
1✔
1118

1119
        return BBox.union_all(bboxes).clip_to(page_bbox)
1✔
1120

1121

1122
def assign_diagrams_to_steps(
1✔
1123
    steps: list[Step],
1124
    diagrams: list[Diagram],
1125
    divider_bboxes: Sequence[BBox] = (),
1126
    max_distance: float = 500.0,
1127
) -> None:
1128
    """Assign diagrams to steps using Hungarian algorithm.
1129

1130
    Uses the shared pairing module to find optimal step number to diagram
1131
    pairings based on:
1132
    - Position: step number should be in top-left of diagram
1133
    - Distance: closer is better
1134
    - Dividers: pairings should not cross divider lines
1135

1136
    This function mutates the Step objects in place, setting their diagram
1137
    attribute.
1138

1139
    Args:
1140
        steps: List of Step objects to assign diagrams to
1141
        diagrams: List of Diagram objects to assign
1142
        divider_bboxes: Sequence of divider bounding boxes to check for crossing
1143
        max_distance: Maximum distance for a valid assignment.
1144
    """
1145
    if not steps or not diagrams:
1✔
1146
        log.debug(
1✔
1147
            "[step] No diagram assignment needed: steps=%d, diagrams=%d",
1148
            len(steps),
1149
            len(diagrams),
1150
        )
1151
        return
1✔
1152

1153
    log.debug(
1✔
1154
        "[step] Running Hungarian matching for diagrams: %d steps, %d diagrams",
1155
        len(steps),
1156
        len(diagrams),
1157
    )
1158

1159
    # Configure pairing
1160
    config = PairingConfig(
1✔
1161
        max_distance=max_distance,
1162
        position_weight=0.5,
1163
        distance_weight=0.5,
1164
        check_dividers=True,
1165
        top_left_tolerance=100.0,
1166
    )
1167

1168
    # Extract bboxes
1169
    step_bboxes = [s.step_number.bbox for s in steps]
1✔
1170
    diagram_bboxes = [d.bbox for d in diagrams]
1✔
1171

1172
    # Find optimal pairings using shared logic
1173
    pairings = find_optimal_pairings(
1✔
1174
        step_bboxes, diagram_bboxes, config, divider_bboxes
1175
    )
1176

1177
    # Assign diagrams to steps based on the matching
1178
    for pairing in pairings:
1✔
1179
        step = steps[pairing.step_index]
1✔
1180
        diag = diagrams[pairing.diagram_index]
1✔
1181
        # Use object.__setattr__ because Step is a frozen Pydantic model
1182
        object.__setattr__(step, "diagram", diag)
1✔
1183
        log.debug(
1✔
1184
            "[step] Assigned diagram at %s to step %d (cost=%.2f)",
1185
            diag.bbox,
1186
            step.step_number.value,
1187
            pairing.cost,
1188
        )
1189

1190

1191
def assign_subassemblies_to_steps(
1✔
1192
    steps: list[Step],
1193
    subassemblies: list[SubAssembly],
1194
    divider_bboxes: Sequence[BBox] = (),
1195
    max_distance: float = 400.0,
1196
) -> None:
1197
    """Assign subassemblies to steps using Hungarian algorithm.
1198

1199
    Uses optimal bipartite matching to pair subassemblies with steps based on:
1200
    - Distance from step's diagram to subassembly (closer is better)
1201
    - No divider between the subassembly and the step's diagram
1202

1203
    This function mutates the Step objects in place, adding to their
1204
    subassemblies list.
1205

1206
    Args:
1207
        steps: List of Step objects to assign subassemblies to
1208
        subassemblies: List of SubAssembly objects to assign
1209
        divider_bboxes: Sequence of divider bounding boxes for obstruction checking
1210
        max_distance: Maximum distance for a valid assignment
1211
    """
1212
    if not steps or not subassemblies:
1✔
1213
        log.debug(
1✔
1214
            "[step] No subassembly assignment needed: steps=%d, subassemblies=%d",
1215
            len(steps),
1216
            len(subassemblies),
1217
        )
1218
        return
1✔
1219

1220
    n_steps = len(steps)
1✔
1221
    n_subassemblies = len(subassemblies)
1✔
1222

1223
    log.debug(
1✔
1224
        "[step] Running Hungarian matching for subassemblies: "
1225
        "%d steps, %d subassemblies",
1226
        n_steps,
1227
        n_subassemblies,
1228
    )
1229

1230
    # Build cost matrix: rows = subassemblies, cols = steps
1231
    cost_matrix = np.zeros((n_subassemblies, n_steps))
1✔
1232
    high_cost = max_distance * 10
1✔
1233

1234
    for i, sa in enumerate(subassemblies):
1✔
1235
        sa_center = sa.bbox.center
1✔
1236
        for j, step in enumerate(steps):
1✔
1237
            # Use diagram center if available, otherwise step bbox center
1238
            if step.diagram:
1✔
1239
                target_bbox = step.diagram.bbox
1✔
1240
                target_center = target_bbox.center
1✔
1241
            else:
1242
                target_bbox = step.bbox
×
1243
                target_center = step.step_number.bbox.center
×
1244

1245
            # Base cost is distance from diagram/step to subassembly center
1246
            distance = (
1✔
1247
                (target_center[0] - sa_center[0]) ** 2
1248
                + (target_center[1] - sa_center[1]) ** 2
1249
            ) ** 0.5
1250

1251
            # Check for divider between subassembly and step's diagram
1252
            if has_divider_between(sa.bbox, target_bbox, divider_bboxes):
1✔
1253
                # High cost if there's a divider between
1254
                distance = high_cost
1✔
1255
                log.debug(
1✔
1256
                    "[step]   Divider between subassembly[%d] at %s and step[%d]",
1257
                    i,
1258
                    sa.bbox,
1259
                    step.step_number.value,
1260
                )
1261

1262
            cost_matrix[i, j] = distance
1✔
1263
            log.debug(
1✔
1264
                "[step]   Cost subassembly[%d] at %s to step[%d]: %.1f",
1265
                i,
1266
                sa.bbox,
1267
                step.step_number.value,
1268
                distance,
1269
            )
1270

1271
    # Apply max_distance threshold
1272
    cost_matrix_thresholded = np.where(
1✔
1273
        cost_matrix > max_distance, high_cost, cost_matrix
1274
    )
1275

1276
    # Run Hungarian algorithm
1277
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1278

1279
    # Assign subassemblies to steps based on the matching
1280
    # Note: Unlike diagrams, multiple subassemblies can be assigned to one step
1281
    # The Hungarian algorithm gives us the best 1:1 matching, but we may want
1282
    # to assign additional nearby subassemblies too
1283

1284
    # First, do the 1:1 optimal assignment
1285
    assigned_subassemblies: set[int] = set()
1✔
1286
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1287
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1288
            sa = subassemblies[row_idx]
1✔
1289
            step = steps[col_idx]
1✔
1290
            # Add to step's subassemblies list
1291
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1292
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1293
            assigned_subassemblies.add(row_idx)
1✔
1294
            log.debug(
1✔
1295
                "[step] Assigned subassembly at %s to step %d (cost=%.1f)",
1296
                sa.bbox,
1297
                step.step_number.value,
1298
                cost_matrix[row_idx, col_idx],
1299
            )
1300

1301
    # Then, try to assign remaining unassigned subassemblies to their nearest step
1302
    # (if within max_distance and no divider between)
1303
    for i, sa in enumerate(subassemblies):
1✔
1304
        if i in assigned_subassemblies:
1✔
1305
            continue
1✔
1306

1307
        # Find the step with lowest cost for this subassembly
1308
        best_step_idx = None
1✔
1309
        best_cost = high_cost
1✔
1310
        for j in range(len(steps)):
1✔
1311
            if cost_matrix[i, j] < best_cost:
1✔
1312
                best_cost = cost_matrix[i, j]
1✔
1313
                best_step_idx = j
1✔
1314

1315
        if best_step_idx is not None and best_cost <= max_distance:
1✔
1316
            step = steps[best_step_idx]
1✔
1317
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1318
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1319
            log.debug(
1✔
1320
                "[step] Assigned extra subassembly at %s to step %d (cost=%.1f)",
1321
                sa.bbox,
1322
                step.step_number.value,
1323
                best_cost,
1324
            )
1325

1326

1327
def assign_rotation_symbols_to_steps(
1✔
1328
    steps: list[Step],
1329
    rotation_symbols: list[RotationSymbol],
1330
    max_distance: float = 300.0,
1331
) -> None:
1332
    """Assign rotation symbols to steps using Hungarian algorithm.
1333

1334
    Uses optimal bipartite matching to pair rotation symbols with steps based on
1335
    minimum distance from the rotation symbol to the step's diagram (or step bbox
1336
    center if no diagram).
1337

1338
    This function mutates the Step objects in place, setting their rotation_symbol
1339
    attribute.
1340

1341
    Args:
1342
        steps: List of Step objects to assign rotation symbols to
1343
        rotation_symbols: List of RotationSymbol objects to assign
1344
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1345
            greater than this will not be matched.
1346
    """
1347
    if not steps or not rotation_symbols:
1✔
1348
        log.debug(
1✔
1349
            "[step] No rotation symbol assignment needed: "
1350
            "steps=%d, rotation_symbols=%d",
1351
            len(steps),
1352
            len(rotation_symbols),
1353
        )
1354
        return
1✔
1355

1356
    n_steps = len(steps)
1✔
1357
    n_symbols = len(rotation_symbols)
1✔
1358

1359
    log.debug(
1✔
1360
        "[step] Running Hungarian matching: %d steps, %d rotation symbols",
1361
        n_steps,
1362
        n_symbols,
1363
    )
1364

1365
    # Build cost matrix: rows = rotation symbols, cols = steps
1366
    # Cost = distance from rotation symbol center to step's diagram center
1367
    # (or step bbox center if no diagram)
1368
    cost_matrix = np.zeros((n_symbols, n_steps))
1✔
1369

1370
    for i, rs in enumerate(rotation_symbols):
1✔
1371
        rs_center = rs.bbox.center
1✔
1372
        for j, step in enumerate(steps):
1✔
1373
            # Use diagram center if available, otherwise step bbox center
1374
            if step.diagram:
1✔
1375
                target_center = step.diagram.bbox.center
1✔
1376
            else:
1377
                target_center = step.bbox.center
×
1378

1379
            # Euclidean distance
1380
            distance = (
1✔
1381
                (rs_center[0] - target_center[0]) ** 2
1382
                + (rs_center[1] - target_center[1]) ** 2
1383
            ) ** 0.5
1384

1385
            cost_matrix[i, j] = distance
1✔
1386
            log.debug(
1✔
1387
                "[step]   Distance from rotation_symbol[%d] at %s to step[%d]: %.1f",
1388
                i,
1389
                rs.bbox,
1390
                step.step_number.value,
1391
                distance,
1392
            )
1393

1394
    # Apply max_distance threshold by setting costs above threshold to a high value
1395
    # This prevents assignments that are too far apart
1396
    high_cost = max_distance * 10  # Arbitrary large value
1✔
1397
    cost_matrix_thresholded = np.where(
1✔
1398
        cost_matrix > max_distance, high_cost, cost_matrix
1399
    )
1400

1401
    # Run Hungarian algorithm
1402
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1403

1404
    # Assign rotation symbols to steps based on the matching
1405
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1406
        # Check if this assignment is within the max_distance threshold
1407
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1408
            rs = rotation_symbols[row_idx]
1✔
1409
            step = steps[col_idx]
1✔
1410
            step.rotation_symbol = rs
1✔
1411
            # Expand step bbox to include the rotation symbol
1412
            step.bbox = step.bbox.union(rs.bbox)
1✔
1413
            log.debug(
1✔
1414
                "[step] Assigned rotation symbol at %s to step %d "
1415
                "(distance=%.1f, new step bbox=%s)",
1416
                rs.bbox,
1417
                step.step_number.value,
1418
                cost_matrix[row_idx, col_idx],
1419
                step.bbox,
1420
            )
1421
        else:
1422
            log.debug(
×
1423
                "[step] Skipped assignment: rotation_symbol[%d] to step[%d] "
1424
                "distance=%.1f > max_distance=%.1f",
1425
                row_idx,
1426
                col_indices[row_idx] if row_idx < len(col_indices) else -1,
1427
                cost_matrix[row_idx, col_idx],
1428
                max_distance,
1429
            )
1430

1431

1432
def filter_subassembly_values[T](
1✔
1433
    items: list[tuple[int, T]],
1434
    *,
1435
    min_gap: int = 3,
1436
    max_subassembly_start: int = 3,
1437
) -> list[tuple[int, T]]:
1438
    """Filter items to exclude likely subassembly values.
1439

1440
    Subassembly steps (e.g., step 1, 2 inside a subassembly box) often appear
1441
    alongside higher-numbered page-level steps (e.g., 15, 16). This creates
1442
    sequences like [1, 2, 15, 16] which cannot all be page-level steps.
1443

1444
    This function identifies such cases by detecting a significant gap in values
1445
    and filtering out the lower-numbered items that are likely subassembly steps.
1446

1447
    Args:
1448
        items: List of (value, item) tuples where value is the step number
1449
            and item is any associated data (e.g., a Candidate).
1450
        min_gap: Minimum gap size to trigger filtering (exclusive).
1451
            Default is 3, so gaps > 3 trigger filtering.
1452
        max_subassembly_start: Maximum starting value for subassembly detection.
1453
            If min value is <= this, filtering is applied. Default is 3.
1454

1455
    Returns:
1456
        Filtered list of (value, item) tuples, keeping only the higher values
1457
        if a subassembly pattern is detected. Returns original list if no
1458
        filtering is needed.
1459

1460
    Examples:
1461
        >>> filter_subassembly_values([(1, "a"), (2, "b"), (15, "c"), (16, "d")])
1462
        [(15, "c"), (16, "d")]
1463

1464
        >>> filter_subassembly_values([(5, "a"), (6, "b"), (15, "c")])
1465
        [(5, "a"), (6, "b"), (15, "c")]  # min=5 > 3, no filtering
1466
    """
1467
    if len(items) <= 1:
1✔
1468
        return items
1✔
1469

1470
    # Sort by value
1471
    sorted_items = sorted(items, key=lambda x: x[0])
1✔
1472
    values = [v for v, _ in sorted_items]
1✔
1473

1474
    # Find the largest gap between consecutive values
1475
    max_gap_size = 0
1✔
1476
    max_gap_index = -1
1✔
1477
    for i in range(len(values) - 1):
1✔
1478
        gap = values[i + 1] - values[i]
1✔
1479
        if gap > max_gap_size:
1✔
1480
            max_gap_size = gap
1✔
1481
            max_gap_index = i
1✔
1482

1483
    # Check if filtering should occur:
1484
    # 1. Gap must be larger than min_gap
1485
    # 2. The minimum value must be <= max_subassembly_start
1486
    if max_gap_size > min_gap and max_gap_index >= 0:
1✔
1487
        min_value = values[0]
1✔
1488
        if min_value <= max_subassembly_start:
1✔
1489
            threshold = values[max_gap_index + 1]
1✔
1490
            return [(v, item) for v, item in sorted_items if v >= threshold]
1✔
1491

1492
    return items
1✔
1493

1494

1495
def filter_valid_substep_sequence[T](
1✔
1496
    items: list[T],
1497
    get_value: Callable[[T], int],
1498
) -> list[T]:
1499
    """Filter substeps to only include valid sequential values starting from 1.
1500

1501
    Substeps within a subassembly or page must form a valid sequence:
1502
    - Must start from 1
1503
    - Values must be consecutive (1, 2, 3, ... not 1, 3, 5)
1504

1505
    This filters out isolated numbers that are likely PieceLengths or other
1506
    misidentified elements.
1507

1508
    Args:
1509
        items: List of substep items to filter
1510
        get_value: Function to extract the step number value from an item
1511

1512
    Returns:
1513
        Filtered list containing only items that form a valid sequence from 1.
1514
        Returns empty list if no valid sequence starting from 1 is found.
1515

1516
    Examples:
1517
        >>> filter_valid_substep_sequence([1, 2, 3], lambda x: x)
1518
        [1, 2, 3]
1519

1520
        >>> filter_valid_substep_sequence([3, 5], lambda x: x)
1521
        []  # No item with value 1
1522

1523
        >>> filter_valid_substep_sequence([1, 2, 5], lambda x: x)
1524
        [1, 2]  # 5 breaks the sequence
1525
    """
1526
    if not items:
1✔
1527
        return []
×
1528

1529
    # Build a map from value to items
1530
    value_to_items: dict[int, list[T]] = {}
1✔
1531
    for item in items:
1✔
1532
        value = get_value(item)
1✔
1533
        if value not in value_to_items:
1✔
1534
            value_to_items[value] = []
1✔
1535
        value_to_items[value].append(item)
1✔
1536

1537
    # Must have step 1 to be a valid sequence
1538
    if 1 not in value_to_items:
1✔
1539
        log.debug("[substep_filter] No substep with value 1 found - rejecting all")
1✔
1540
        return []
1✔
1541

1542
    # Collect items in sequence starting from 1
1543
    result: list[T] = []
1✔
1544
    expected_value = 1
1✔
1545

1546
    while expected_value in value_to_items:
1✔
1547
        result.extend(value_to_items[expected_value])
1✔
1548
        expected_value += 1
1✔
1549

1550
    log.debug(
1✔
1551
        "[substep_filter] Filtered %d items to %d valid sequential substeps (1-%d)",
1552
        len(items),
1553
        len(result),
1554
        expected_value - 1,
1555
    )
1556

1557
    return result
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc