• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20389851973

20 Dec 2025 05:31AM UTC coverage: 89.185% (+0.04%) from 89.145%
20389851973

push

github

bramp
Add support for `ty` to the pyproject.toml.

13384 of 15007 relevant lines covered (89.19%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.8
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from collections.abc import Callable, Sequence
1✔
25

26
import numpy as np
1✔
27
from scipy.optimize import linear_sum_assignment
1✔
28

29
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
30
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
31
    CandidateFailedError,
32
    ClassificationResult,
33
)
34
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
35
    LabelClassifier,
36
)
37
from build_a_long.pdf_extract.classifier.rule_based_classifier import (
1✔
38
    StepNumberScore,
39
)
40
from build_a_long.pdf_extract.classifier.score import (
1✔
41
    Score,
42
    Weight,
43
    find_best_scoring,
44
)
45
from build_a_long.pdf_extract.classifier.steps.pairing import (
1✔
46
    PairingConfig,
47
    find_optimal_pairings,
48
    has_divider_between,
49
)
50
from build_a_long.pdf_extract.classifier.steps.substep_classifier import _SubStepScore
1✔
51
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
52
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
53
    Arrow,
54
    Diagram,
55
    Divider,
56
    LegoPageElements,
57
    PartsList,
58
    RotationSymbol,
59
    Step,
60
    StepNumber,
61
    SubAssembly,
62
    SubStep,
63
)
64

65
log = logging.getLogger(__name__)
1✔
66

67

68
class _StepScore(Score):
1✔
69
    """Internal score representation for step classification."""
70

71
    step_value: int
72
    """The parsed step number value (e.g., 1, 2, 3)."""
1✔
73

74
    step_number_candidate: Candidate
75
    """The step number candidate this step is associated with."""
1✔
76

77
    parts_list_candidate: Candidate | None
78
    """The parts list candidate paired with this step (if any)."""
1✔
79

80
    has_parts_list: bool
81
    """Whether this step has an associated parts list."""
1✔
82

83
    step_proximity_score: float
84
    """Score based on proximity to the PartsList above (0.0-1.0).
1✔
85
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
86

87
    step_alignment_score: float
88
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
1✔
89
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
90

91
    def score(self) -> Weight:
1✔
92
        """Return the overall pairing score."""
93
        return self.overall_score()
×
94

95
    def overall_score(self) -> float:
1✔
96
        """Calculate overall quality score based on parts list pairing.
97

98
        Steps with a parts_list are given a bonus to prefer them over
99
        steps without parts_list. Diagrams are found at build time, not
100
        during scoring, to allow rotation symbols to claim small images first.
101
        """
102
        if self.has_parts_list:
1✔
103
            # Base score for having parts_list + proximity/alignment bonus
104
            parts_list_bonus = 0.5
1✔
105
            pairing_score = (
1✔
106
                self.step_proximity_score + self.step_alignment_score
107
            ) / 2.0
108
            return parts_list_bonus + 0.5 * pairing_score
1✔
109
        return 0.3  # Lower base score for steps without parts list
1✔
110

111
    def sort_key(self) -> tuple[float, int]:
1✔
112
        """Return a tuple for sorting candidates.
113

114
        We prefer:
115
        1. Higher overall scores (better StepNumber-PartsList-Diagram match)
116
        2. Lower step number values (to break ties and maintain order)
117
        """
118
        return (-self.overall_score(), self.step_value)
1✔
119

120

121
class StepClassifier(LabelClassifier):
1✔
122
    """Classifier for complete Step structures."""
123

124
    output = "step"
1✔
125
    requires = frozenset(
1✔
126
        {
127
            "arrow",
128
            "step_number",
129
            "parts_list",
130
            "diagram",
131
            "rotation_symbol",
132
            "subassembly",
133
            "substep",
134
            "preview",
135
        }
136
    )
137

138
    def _score(self, result: ClassificationResult) -> None:
1✔
139
        """Score step pairings and create candidates."""
140
        page_data = result.page_data
1✔
141

142
        # Get step number and parts list candidates (not constructed elements)
143
        step_number_candidates = result.get_scored_candidates("step_number")
1✔
144

145
        if not step_number_candidates:
1✔
146
            return
1✔
147

148
        # Get parts_list candidates
149
        parts_list_candidates = result.get_scored_candidates("parts_list")
1✔
150

151
        log.debug(
1✔
152
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
153
            page_data.page_number,
154
            len(step_number_candidates),
155
            len(parts_list_candidates),
156
        )
157

158
        # Create all possible Step candidates for pairings (without diagrams initially)
159
        # Deduplication happens at build time, not scoring time, so that
160
        # diagram assignment can be re-evaluated as blocks get claimed.
161
        all_candidates: list[Candidate] = []
1✔
162
        for step_candidate in step_number_candidates:
1✔
163
            # Create candidates for this StepNumber paired with each PartsList
164
            for parts_list_candidate in parts_list_candidates:
1✔
165
                candidate = self._create_step_candidate(
1✔
166
                    step_candidate, parts_list_candidate, result
167
                )
168
                if candidate:
1✔
169
                    all_candidates.append(candidate)
1✔
170

171
            # Also create a candidate with no PartsList (fallback)
172
            candidate = self._create_step_candidate(step_candidate, None, result)
1✔
173
            if candidate:
1✔
174
                all_candidates.append(candidate)
1✔
175

176
        # Add all candidates to result (deduplication happens at build time)
177
        for candidate in all_candidates:
1✔
178
            result.add_candidate(candidate)
1✔
179

180
        log.debug(
1✔
181
            "[step] Created %d step candidates",
182
            len(all_candidates),
183
        )
184

185
    def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
1✔
186
        """Build all Step elements with coordinated element assignment.
187

188
        Build Order (phases run sequentially):
189
        - Phase 1: Rotation symbols - claim small circular arrow graphics
190
        - Phase 2: Parts lists - claim part count blocks and part images
191
        - Phase 3: Arrows - claim arrowhead triangles and shaft lines/rectangles
192
        - Phase 4: Subassemblies & Previews - claim white callout boxes
193
        - Phase 5: Steps - build step_number + parts_list pairs (no diagram yet)
194
        - Phase 6: Diagrams - build remaining unclaimed diagram regions
195
        - Phase 7: Assign diagrams to steps via Hungarian matching
196
        - Phase 8: Assign subassemblies to steps via Hungarian matching
197
        - Phase 9: Finalize steps - add arrows, compute final bboxes
198
        - Phase 10: Assign rotation symbols to steps via Hungarian matching
199

200
        The order matters because elements compete for the same Drawing blocks.
201
        Earlier phases claim blocks first, preventing later phases from using them.
202
        For example:
203
        - Arrows must build before subassemblies so arrow shafts aren't consumed
204
          by subassembly white boxes
205
        - Rotation symbols must build before diagrams so small circular graphics
206
          aren't incorrectly clustered into diagrams
207

208
        Deduplication of steps happens at build time (not scoring time) so that
209
        diagram assignment can be re-evaluated as blocks get claimed.
210

211
        Returns:
212
            List of successfully constructed Step elements, sorted by step number
213
        """
214
        # Pre-check: Get step candidates early to decide if we should build
215
        # supporting elements (rotation symbols, arrows, etc.)
216
        # On INFO pages with no steps, building these would create orphaned elements.
217
        all_step_candidates = result.get_scored_candidates("step")
1✔
218
        has_step_candidates = bool(all_step_candidates)
1✔
219

220
        # Phase 1: Build all rotation symbols BEFORE steps.
221
        # This allows rotation symbols to claim their Drawing blocks first,
222
        # preventing them from being incorrectly clustered into diagrams.
223
        #
224
        # IMPORTANT: Only build rotation symbols if there are step candidates.
225
        # On INFO pages, there are no steps to assign rotation symbols to.
226
        if has_step_candidates:
1✔
227
            for rs_candidate in result.get_scored_candidates("rotation_symbol"):
1✔
228
                try:
1✔
229
                    result.build(rs_candidate)
1✔
230
                    log.debug(
1✔
231
                        "[step] Built rotation symbol at %s (score=%.2f)",
232
                        rs_candidate.bbox,
233
                        rs_candidate.score,
234
                    )
235
                except Exception as e:
×
236
                    log.debug(
×
237
                        "[step] Failed to construct rotation_symbol "
238
                        "candidate at %s: %s",
239
                        rs_candidate.bbox,
240
                        e,
241
                    )
242
        else:
243
            log.debug("[step] Skipping rotation symbol build - no step candidates")
1✔
244

245
        # Phase 2: Build all parts lists BEFORE steps.
246
        # This allows parts lists to claim their Drawing blocks first,
247
        # preventing them from being claimed by subassemblies.
248
        for pl_candidate in result.get_scored_candidates("parts_list"):
1✔
249
            try:
1✔
250
                result.build(pl_candidate)
1✔
251
                log.debug(
1✔
252
                    "[step] Built parts_list at %s (score=%.2f)",
253
                    pl_candidate.bbox,
254
                    pl_candidate.score,
255
                )
256
            except Exception as e:
×
257
                log.debug(
×
258
                    "[step] Failed to construct parts_list candidate at %s: %s",
259
                    pl_candidate.bbox,
260
                    e,
261
                )
262

263
        # Phase 3: Build all arrows BEFORE subassemblies.
264
        # Arrows connect subassembly callout boxes to main diagrams. The arrow
265
        # shaft may be a thin rectangle that overlaps the subassembly's white
266
        # box. By building arrows first, they claim the shaft blocks before
267
        # subassemblies can consume them.
268
        #
269
        # IMPORTANT: Only build arrows if there are step candidates.
270
        # On INFO pages, there are no steps to assign arrows to, so building
271
        # them would create orphaned elements that trigger validation errors.
272
        # Note: has_step_candidates was computed in the pre-check.
273
        if has_step_candidates:
1✔
274
            for arrow_candidate in result.get_scored_candidates("arrow"):
1✔
275
                try:
1✔
276
                    result.build(arrow_candidate)
1✔
277
                    log.debug(
1✔
278
                        "[step] Built arrow at %s (score=%.2f)",
279
                        arrow_candidate.bbox,
280
                        arrow_candidate.score,
281
                    )
282
                except Exception as e:
×
283
                    log.debug(
×
284
                        "[step] Failed to construct arrow candidate at %s: %s",
285
                        arrow_candidate.bbox,
286
                        e,
287
                    )
288
        else:
289
            log.debug("[step] Skipping arrow build - no step candidates on page")
1✔
290

291
        # Phase 4: Build subassemblies and previews BEFORE steps.
292
        # Both subassemblies and previews are white boxes with diagrams inside.
293
        # We combine them and build in score order so the higher-scoring
294
        # candidate claims the white box first. When a candidate is built,
295
        # its source_blocks are marked as consumed, causing any competing
296
        # candidate using the same blocks to fail.
297
        #
298
        # This allows subassemblies (which have step_count labels like "2x")
299
        # to be distinguished from previews (which appear before steps).
300
        subassembly_candidates = result.get_scored_candidates("subassembly")
1✔
301
        preview_candidates = result.get_scored_candidates("preview")
1✔
302

303
        # Combine and sort by score (highest first)
304
        combined_candidates = list(subassembly_candidates) + list(preview_candidates)
1✔
305
        combined_candidates.sort(key=lambda c: c.score, reverse=True)
1✔
306

307
        for candidate in combined_candidates:
1✔
308
            try:
1✔
309
                result.build(candidate)
1✔
310
                log.debug(
1✔
311
                    "[step] Built %s at %s (score=%.2f)",
312
                    candidate.label,
313
                    candidate.bbox,
314
                    candidate.score,
315
                )
316
            except Exception as e:
1✔
317
                log.debug(
1✔
318
                    "[step] Failed to construct %s candidate at %s: %s",
319
                    candidate.label,
320
                    candidate.bbox,
321
                    e,
322
                )
323

324
        # Phase 5: Build Step candidates with deduplication by step value
325
        # Filter out candidates that look like substeps (small step numbers with
326
        # a significant gap from main step numbers). These are handled by
327
        # SubStepClassifier instead.
328
        # Steps are built as "partial" - just step_number + parts_list.
329
        # Diagrams and subassemblies are assigned later via Hungarian matching.
330
        # Note: all_step_candidates was already fetched in Phase 3.
331
        page_level_step_candidates = self._filter_page_level_step_candidates(
1✔
332
            all_step_candidates
333
        )
334

335
        # Sort by score (highest first) so best candidates get built first
336
        sorted_candidates = sorted(
1✔
337
            page_level_step_candidates,
338
            key=lambda c: c.score,
339
            reverse=True,
340
        )
341

342
        steps: list[Step] = []
1✔
343
        built_step_values: set[int] = set()
1✔
344

345
        for step_candidate in sorted_candidates:
1✔
346
            # Get step value from score
347
            score = step_candidate.score_details
1✔
348
            if not isinstance(score, _StepScore):
1✔
349
                continue
×
350

351
            # Skip if we've already built a step with this value
352
            if score.step_value in built_step_values:
1✔
353
                log.debug(
1✔
354
                    "[step] Skipping duplicate step %d candidate (score=%.2f)",
355
                    score.step_value,
356
                    step_candidate.score,
357
                )
358
                continue
1✔
359

360
            try:
1✔
361
                elem = result.build(step_candidate)
1✔
362
                assert isinstance(elem, Step)
1✔
363
                steps.append(elem)
1✔
364
                built_step_values.add(score.step_value)
1✔
365
                log.debug(
1✔
366
                    "[step] Built partial step %d (parts_list=%s, score=%.2f)",
367
                    score.step_value,
368
                    "yes" if score.parts_list_candidate is not None else "no",
369
                    step_candidate.score,
370
                )
371
            except Exception as e:
1✔
372
                log.debug(
1✔
373
                    "[step] Failed to construct step %d candidate: %s",
374
                    score.step_value,
375
                    e,
376
                )
377

378
        # Sort steps by their step_number value
379
        steps.sort(key=lambda step: step.step_number.value)
1✔
380

381
        # Phase 6: Build diagrams (not yet claimed by subassemblies)
382
        # Phase 7: Assign diagrams to steps using Hungarian matching
383
        # Collect available diagram candidates (not yet claimed by subassemblies)
384
        available_diagrams: list[Diagram] = []
1✔
385
        for diag_candidate in result.get_scored_candidates("diagram"):
1✔
386
            if diag_candidate.constructed is None:
1✔
387
                # Build the diagram
388
                try:
1✔
389
                    diag_elem = result.build(diag_candidate)
1✔
390
                    assert isinstance(diag_elem, Diagram)
1✔
391
                    available_diagrams.append(diag_elem)
1✔
392
                except Exception as e:
1✔
393
                    log.debug(
1✔
394
                        "[step] Failed to build diagram at %s: %s",
395
                        diag_candidate.bbox,
396
                        e,
397
                    )
398
            elif isinstance(diag_candidate.constructed, Diagram):
1✔
399
                # Already built (claimed by subassembly) - skip
400
                pass
1✔
401

402
        log.debug(
1✔
403
            "[step] Available diagrams for step assignment: %d",
404
            len(available_diagrams),
405
        )
406

407
        # Collect divider bboxes for obstruction checking (used by multiple phases)
408
        divider_bboxes: list[BBox] = []
1✔
409
        for div_candidate in result.get_built_candidates("divider"):
1✔
410
            assert div_candidate.constructed is not None
1✔
411
            assert isinstance(div_candidate.constructed, Divider)
1✔
412
            divider_bboxes.append(div_candidate.constructed.bbox)
1✔
413

414
        # Assign diagrams to steps using Hungarian matching
415
        # Dividers prevent pairing across page divisions
416
        assign_diagrams_to_steps(steps, available_diagrams, divider_bboxes)
1✔
417

418
        # Phase 8: Assign subassemblies to steps using Hungarian matching
419
        # Collect built subassemblies
420
        subassemblies: list[SubAssembly] = []
1✔
421
        for sa_candidate in result.get_built_candidates("subassembly"):
1✔
422
            assert sa_candidate.constructed is not None
1✔
423
            assert isinstance(sa_candidate.constructed, SubAssembly)
1✔
424
            subassemblies.append(sa_candidate.constructed)
1✔
425

426
        log.debug(
1✔
427
            "[step] Assigning %d subassemblies to %d steps (%d dividers for "
428
            "obstruction checking)",
429
            len(subassemblies),
430
            len(steps),
431
            len(divider_bboxes),
432
        )
433

434
        # Assign subassemblies to steps using Hungarian matching
435
        assign_subassemblies_to_steps(steps, subassemblies, divider_bboxes)
1✔
436

437
        # Phase 8b: Get unclaimed SubSteps as naked substeps
438
        # SubStepClassifier found small step number + diagram pairs.
439
        # SubAssemblyClassifier claimed those inside callout boxes.
440
        # The remaining ones become "naked" substeps for the main step.
441
        if steps:
1✔
442
            substeps = self._get_unclaimed_substeps(result)
1✔
443
            if substeps:
1✔
444
                # Assign all substeps to the single main step on the page
445
                # (In the future, we could use Hungarian matching if there are
446
                # multiple main steps, but for now this covers the common case)
447
                main_step = steps[0]  # Usually only one main step when substeps exist
1✔
448
                object.__setattr__(main_step, "substeps", substeps)
1✔
449
                log.debug(
1✔
450
                    "[step] Assigned %d substeps to step %d",
451
                    len(substeps),
452
                    main_step.step_number.value,
453
                )
454

455
        # Phase 9: Finalize steps - collect arrows and compute final bboxes
456
        page_bbox = result.page_data.bbox
1✔
457
        for step in steps:
1✔
458
            # Get arrows for this step (pass subassemblies for search region)
459
            arrows = self._get_arrows_for_step(
1✔
460
                step.step_number, step.diagram, step.subassemblies, result
461
            )
462

463
            # Compute final bbox including all components
464
            final_bbox = self._compute_step_bbox(
1✔
465
                step.step_number,
466
                step.parts_list,
467
                step.diagram,
468
                step.subassemblies,
469
                page_bbox,
470
            )
471

472
            # Update the step (Step is a Pydantic model, so we need to reassign)
473
            # We mutate in place by directly setting attributes
474
            object.__setattr__(step, "arrows", arrows)
1✔
475
            object.__setattr__(step, "bbox", final_bbox)
1✔
476

477
        # Phase 10: Assign rotation symbols to steps using Hungarian matching
478
        # Collect built rotation symbols
479
        rotation_symbols: list[RotationSymbol] = []
1✔
480
        for rs_candidate in result.get_built_candidates("rotation_symbol"):
1✔
481
            assert rs_candidate.constructed is not None
1✔
482
            assert isinstance(rs_candidate.constructed, RotationSymbol)
1✔
483
            rotation_symbols.append(rs_candidate.constructed)
1✔
484

485
        assign_rotation_symbols_to_steps(steps, rotation_symbols)
1✔
486

487
        log.debug(
1✔
488
            "[step] build_all complete: %d steps, %d rotation symbols assigned",
489
            len(steps),
490
            sum(1 for s in steps if s.rotation_symbol is not None),
491
        )
492

493
        # Cast for type compatibility with base class return type
494
        return list[LegoPageElements](steps)
1✔
495

496
    def _filter_page_level_step_candidates(
1✔
497
        self, candidates: list[Candidate]
498
    ) -> list[Candidate]:
499
        """Filter step candidates to exclude likely substep candidates.
500

501
        When a page has both low-numbered steps (1, 2, 3, 4) and high-numbered
502
        steps (338), with a significant gap between them, the low numbers are
503
        likely substeps rather than main steps. These are handled by
504
        SubStepClassifier instead.
505

506
        Args:
507
            candidates: All step candidates to filter
508

509
        Returns:
510
            Page-level step candidates (excluding substep candidates)
511
        """
512
        # Extract step number values from candidates
513
        items_with_values: list[tuple[int, Candidate]] = []
1✔
514
        for candidate in candidates:
1✔
515
            score = candidate.score_details
1✔
516
            if not isinstance(score, _StepScore):
1✔
517
                continue
×
518
            items_with_values.append((score.step_value, candidate))
1✔
519

520
        # Use generic filtering function to find page-level steps
521
        filtered_items = filter_subassembly_values(items_with_values)
1✔
522

523
        # If filtering occurred, return only page-level candidates
524
        if len(filtered_items) != len(items_with_values):
1✔
525
            page_level_values = {v for v, _ in filtered_items}
1✔
526
            page_level_candidates = [
1✔
527
                item for v, item in items_with_values if v in page_level_values
528
            ]
529
            log.debug(
1✔
530
                "[step] Filtered to page-level candidates: %s (excluded substeps: %s)",
531
                sorted(page_level_values),
532
                sorted(v for v, _ in items_with_values if v not in page_level_values),
533
            )
534
            return page_level_candidates
1✔
535

536
        # No filtering occurred, return all as page-level
537
        return candidates
1✔
538

539
    def _get_unclaimed_substeps(
1✔
540
        self,
541
        result: ClassificationResult,
542
    ) -> list[SubStep]:
543
        """Get SubStep elements that weren't claimed by a SubAssembly.
544

545
        SubStepClassifier creates candidates for all small step number +
546
        diagram pairs. SubAssemblyClassifier then claims those inside callout boxes.
547
        This method returns the remaining unclaimed SubSteps for use as
548
        "naked" substeps in a Step.
549

550
        The substeps are filtered to only include valid sequences starting from 1.
551
        This helps filter out PieceLengths or other misidentified elements.
552

553
        Args:
554
            result: Classification result
555

556
        Returns:
557
            List of SubStep elements that weren't claimed by any SubAssembly,
558
            sorted by step number value, and filtered to valid sequences from 1.
559
        """
560
        # Get all substep candidates
561
        substep_candidates = result.get_scored_candidates("substep")
1✔
562

563
        # Filter to only unclaimed candidates (not yet built)
564
        unclaimed_candidates = [c for c in substep_candidates if c.constructed is None]
1✔
565

566
        if not unclaimed_candidates:
1✔
567
            return []
1✔
568

569
        # Extract step values from candidates to filter BEFORE building
570
        def get_step_value(candidate: Candidate) -> int:
1✔
571
            """Get step value from a substep candidate's score."""
572
            score = candidate.score_details
1✔
573
            assert isinstance(score, _SubStepScore)
1✔
574

575
            return score.step_value
1✔
576

577
        # Filter candidates to valid sequential substeps (must start from 1)
578
        valid_candidates = filter_valid_substep_sequence(
1✔
579
            unclaimed_candidates, get_step_value
580
        )
581

582
        if not valid_candidates:
1✔
583
            log.debug("[step] No valid substep sequence found (no step 1)")
1✔
584
            return []
1✔
585

586
        # Now build only the filtered candidates
587
        substeps: list[SubStep] = []
1✔
588
        for candidate in valid_candidates:
1✔
589
            try:
1✔
590
                substep_elem = result.build(candidate)
1✔
591
                assert isinstance(substep_elem, SubStep)
1✔
592
                substeps.append(substep_elem)
1✔
593
            except CandidateFailedError:
1✔
594
                # Already claimed - skip
595
                continue
1✔
596

597
        # Sort by step number value
598
        substeps.sort(key=lambda s: s.step_number.value)
1✔
599

600
        log.debug(
1✔
601
            "[step] Found %d unclaimed substeps (after sequence filtering)",
602
            len(substeps),
603
        )
604

605
        return substeps
1✔
606

607
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
1✔
608
        """Construct a partial Step element from a single candidate.
609

610
        This creates a Step with just step_number and parts_list. The diagram,
611
        subassemblies, and arrows are assigned later in build_all() using
612
        Hungarian matching for optimal global assignment.
613
        """
614
        score = candidate.score_details
1✔
615
        assert isinstance(score, _StepScore)
1✔
616

617
        # Validate and extract step number from parent candidate
618
        step_num_candidate = score.step_number_candidate
1✔
619

620
        step_num_elem = result.build(step_num_candidate)
1✔
621
        assert isinstance(step_num_elem, StepNumber)
1✔
622
        step_num = step_num_elem
1✔
623

624
        # Validate and extract parts list from parent candidate (if present)
625
        parts_list = None
1✔
626
        if score.parts_list_candidate:
1✔
627
            parts_list_candidate = score.parts_list_candidate
1✔
628
            parts_list_elem = result.build(parts_list_candidate)
1✔
629
            assert isinstance(parts_list_elem, PartsList)
1✔
630
            parts_list = parts_list_elem
1✔
631

632
        # Create partial step - diagram, subassemblies, arrows assigned later
633
        initial_bbox = step_num.bbox
1✔
634
        if parts_list:
1✔
635
            initial_bbox = initial_bbox.union(parts_list.bbox)
1✔
636

637
        return Step(
1✔
638
            bbox=initial_bbox,
639
            step_number=step_num,
640
            parts_list=parts_list,
641
            diagram=None,
642
            rotation_symbol=None,
643
            arrows=[],
644
            subassemblies=[],
645
        )
646

647
    def _find_and_build_diagram_for_step(
1✔
648
        self,
649
        step_num: StepNumber,
650
        parts_list: PartsList | None,
651
        result: ClassificationResult,
652
    ) -> Diagram | None:
653
        """Find and build the best diagram for this step.
654

655
        This is called at build time, after rotation symbols and subassemblies
656
        have been built (in build_all Phases 1 and 3), so they've already
657
        claimed any images/diagrams they need. We filter to only consider
658
        diagram candidates that haven't been constructed yet.
659

660
        Args:
661
            step_num: The built step number element
662
            parts_list: The built parts list element (if any)
663
            result: Classification result containing diagram candidates
664

665
        Returns:
666
            The built Diagram element, or None if no suitable diagram found
667
        """
668
        # Get all non-failed, non-constructed diagram candidates
669
        diagram_candidates = result.get_scored_candidates("diagram")
×
670

671
        # Filter to only candidates that haven't been built yet
672
        # (subassemblies and rotation symbols built earlier may have claimed some)
673
        available_candidates = [c for c in diagram_candidates if c.constructed is None]
×
674

675
        if not available_candidates:
×
676
            log.debug(
×
677
                "[step] No diagram candidates available for step %d",
678
                step_num.value,
679
            )
680
            return None
×
681

682
        # Score each candidate based on position relative to step
683
        step_bbox = step_num.bbox
×
684
        best_candidate = None
×
685
        best_score = -float("inf")
×
686

687
        for candidate in available_candidates:
×
688
            score = self._score_step_diagram_pair(step_bbox, candidate.bbox)
×
689

690
            log.debug(
×
691
                "[step] Diagram candidate at %s for step %d: score=%.2f",
692
                candidate.bbox,
693
                step_num.value,
694
                score,
695
            )
696

697
            if score > best_score:
×
698
                best_score = score
×
699
                best_candidate = candidate
×
700

701
        if best_candidate is None or best_score < 0.2:
×
702
            log.debug(
×
703
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
704
                step_num.value,
705
                best_score,
706
            )
707
            return None
×
708

709
        # Build the diagram
710
        try:
×
711
            diagram_elem = result.build(best_candidate)
×
712
            assert isinstance(diagram_elem, Diagram)
×
713
            log.debug(
×
714
                "[step] Built diagram at %s for step %d (score=%.2f)",
715
                diagram_elem.bbox,
716
                step_num.value,
717
                best_score,
718
            )
719
            return diagram_elem
×
720
        except CandidateFailedError as e:
×
721
            log.debug(
×
722
                "[step] Failed to build diagram for step %d: %s",
723
                step_num.value,
724
                e,
725
            )
726
            return None
×
727

728
    def _create_step_candidate(
1✔
729
        self,
730
        step_candidate: Candidate,
731
        parts_list_candidate: Candidate | None,
732
        result: ClassificationResult,
733
    ) -> Candidate | None:
734
        """Create a Step candidate (without diagram assignment).
735

736
        Diagrams are found at build time, not during scoring, to allow
737
        rotation symbols to claim small images first.
738

739
        Args:
740
            step_candidate: The StepNumber candidate for this step
741
            parts_list_candidate: The PartsList candidate to pair with (or None)
742
            result: Classification result
743

744
        Returns:
745
            The created Candidate with score but no construction, or None if
746
            the step number value cannot be extracted.
747
        """
748
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
1✔
749
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
1✔
750
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance
1✔
751

752
        # Extract step number value from the candidate's score
753
        score_details = step_candidate.score_details
1✔
754
        if not isinstance(score_details, StepNumberScore):
1✔
755
            return None
×
756
        step_value = score_details.step_value
1✔
757
        if step_value == 0:
1✔
758
            return None
×
759

760
        step_bbox = step_candidate.bbox
1✔
761
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None
1✔
762

763
        # Calculate pairing scores if there's a parts_list above the step
764
        proximity_score = 0.0
1✔
765
        alignment_score = 0.0
1✔
766

767
        if (
1✔
768
            parts_list_bbox is not None
769
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
770
        ):
771
            # Calculate distance (how far apart vertically)
772
            distance = step_bbox.y0 - parts_list_bbox.y1
1✔
773

774
            # Calculate proximity score
775
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
1✔
776
            if max_distance > 0:
1✔
777
                proximity_score = max(0.0, 1.0 - (distance / max_distance))
1✔
778

779
            # Calculate alignment score (how well left edges align)
780
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
1✔
781
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
1✔
782
            if max_alignment_diff > 0:
1✔
783
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))
1✔
784

785
        # Create score object with candidate references
786
        # Diagrams are found at build time, not during scoring
787
        score = _StepScore(
1✔
788
            step_value=step_value,
789
            step_number_candidate=step_candidate,
790
            parts_list_candidate=parts_list_candidate,
791
            has_parts_list=parts_list_candidate is not None,
792
            step_proximity_score=proximity_score,
793
            step_alignment_score=alignment_score,
794
        )
795

796
        # Calculate combined bbox for the candidate (without diagram)
797
        combined_bbox = step_bbox
1✔
798
        if parts_list_bbox:
1✔
799
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)
1✔
800

801
        # Create candidate
802
        return Candidate(
1✔
803
            bbox=combined_bbox,
804
            label="step",
805
            score=score.overall_score(),
806
            score_details=score,
807
            source_blocks=[],
808
        )
809

810
    def _score_step_diagram_pair(
1✔
811
        self,
812
        step_bbox: BBox,
813
        diagram_bbox: BBox,
814
    ) -> float:
815
        """Score how well a diagram matches a step.
816

817
        Diagrams are typically positioned to the right of and/or below the step
818
        number. This method scores based on:
819
        - Horizontal position: prefer diagrams to the right, penalize left
820
        - Vertical position: prefer diagrams below the step header
821
        - Distance: closer is better
822

823
        Args:
824
            step_bbox: The step number bounding box
825
            diagram_bbox: The diagram bounding box to score
826

827
        Returns:
828
            Score between 0.0 and 1.0 (higher is better match)
829
        """
830
        # Reference point: bottom-right of step number
831
        ref_x = step_bbox.x1
×
832
        ref_y = step_bbox.y1
×
833

834
        # TODO Move all these constants into config, or make them adaptive?
835

836
        # Horizontal score
837
        # Diagrams to the right are preferred, but allow some overlap
838
        x_offset = diagram_bbox.x0 - ref_x
×
839

840
        if x_offset >= -50:
×
841
            # Diagram starts to the right or slightly overlapping - good
842
            # Score decreases slightly with distance to the right
843
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
×
844
        elif x_offset >= -200:
×
845
            # Diagram is moderately to the left - acceptable
846
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
×
847
        else:
848
            # Diagram is far to the left - poor match
849
            x_score = max(0.1, 0.3 + x_offset / 400.0)
×
850

851
        # Vertical score
852
        # Diagrams below the step header are preferred
853
        y_offset = diagram_bbox.y0 - ref_y
×
854

855
        if y_offset >= -30:
×
856
            # Diagram starts below or slightly overlapping - good
857
            # Score decreases with vertical distance
858
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
×
859
        elif y_offset >= -100:
×
860
            # Diagram is moderately above - less good but acceptable
861
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
×
862
        else:
863
            # Diagram is far above - poor match
864
            y_score = max(0.05, 0.2 + y_offset / 300.0)
×
865

866
        # Combined score - weight both dimensions equally
867
        score = 0.5 * x_score + 0.5 * y_score
×
868

869
        return score
×
870

871
    def _get_rotation_symbol_for_step(
1✔
872
        self,
873
        step_bbox: BBox,
874
        result: ClassificationResult,
875
    ) -> RotationSymbol | None:
876
        """Find an already-built rotation symbol within this step's area.
877

878
        Rotation symbols are built by PageClassifier before steps are processed.
879
        This method finds the already-built rotation symbol that falls within
880
        the step's bounding box.
881

882
        Args:
883
            step_bbox: The step's bounding box (including step_num, parts_list,
884
                and diagram)
885
            result: Classification result containing rotation symbol candidates
886

887
        Returns:
888
            Single RotationSymbol element for this step, or None if not found
889
        """
890
        # Get rotation_symbol candidates - they should already be built
891
        # by PageClassifier. Use get_built_candidates to only get successfully
892
        # constructed rotation symbols.
893
        rotation_symbol_candidates = result.get_built_candidates("rotation_symbol")
×
894

895
        log.debug(
×
896
            "[step] Looking for rotation symbols in step bbox %s, "
897
            "found %d built candidates",
898
            step_bbox,
899
            len(rotation_symbol_candidates),
900
        )
901

902
        if not rotation_symbol_candidates:
×
903
            return None
×
904

905
        # Find best-scoring rotation symbol overlapping the step's bbox
906
        overlapping_candidates = filter_overlapping(
×
907
            rotation_symbol_candidates, step_bbox
908
        )
909
        best_candidate = find_best_scoring(overlapping_candidates)
×
910

911
        if best_candidate and best_candidate.constructed is not None:
×
912
            rotation_symbol = best_candidate.constructed
×
913
            assert isinstance(rotation_symbol, RotationSymbol)
×
914
            log.debug(
×
915
                "[step] Found rotation symbol at %s (score=%.2f)",
916
                rotation_symbol.bbox,
917
                best_candidate.score,
918
            )
919
            return rotation_symbol
×
920

921
        log.debug("[step] No rotation symbol found in step bbox %s", step_bbox)
×
922
        return None
×
923

924
    def _get_arrows_for_step(
1✔
925
        self,
926
        step_num: StepNumber,
927
        diagram: Diagram | None,
928
        subassemblies: list[SubAssembly],
929
        result: ClassificationResult,
930
    ) -> list[Arrow]:
931
        """Collect already-built arrows that belong to this step.
932

933
        Arrows are built in Phase 3 of build_all(), before subassemblies.
934
        This method finds arrows that are positioned near the step's diagram
935
        and assigns them to this step.
936

937
        Typically these are arrows pointing from subassembly callout boxes
938
        to the main diagram. The search region includes both the diagram and
939
        any subassemblies assigned to this step.
940

941
        Args:
942
            step_num: The step number element
943
            diagram: The diagram element (if any)
944
            subassemblies: List of subassemblies assigned to this step
945
            result: Classification result containing arrow candidates
946

947
        Returns:
948
            List of Arrow elements for this step
949
        """
950
        # Get already-built arrows (built in Phase 3)
951
        arrow_candidates = result.get_built_candidates("arrow")
1✔
952

953
        log.debug(
1✔
954
            "[step] Looking for arrows for step %d, found %d built arrows",
955
            step_num.value,
956
            len(arrow_candidates),
957
        )
958

959
        if not arrow_candidates:
1✔
960
            return []
1✔
961

962
        # Build search region from diagram and subassemblies
963
        # Arrows typically connect subassemblies to the main diagram, so we need
964
        # to search both areas
965
        search_bboxes: list[BBox] = []
1✔
966
        if diagram:
1✔
967
            search_bboxes.append(diagram.bbox)
1✔
968
        for sa in subassemblies:
1✔
969
            search_bboxes.append(sa.bbox)
1✔
970

971
        # Fallback to step number bbox if no diagram or subassemblies
972
        if not search_bboxes:
1✔
973
            search_bboxes.append(step_num.bbox)
×
974

975
        # Compute union of all components and expand
976
        search_bbox = BBox.union_all(search_bboxes)
1✔
977

978
        # Expand search region to catch arrows near the components
979
        # Use a larger margin since arrows can extend further
980
        search_region = search_bbox.expand(100.0)
1✔
981

982
        log.debug(
1✔
983
            "[step] Arrow search region for step %d: %s",
984
            step_num.value,
985
            search_region,
986
        )
987

988
        # Find arrows within or overlapping the search region
989
        arrows: list[Arrow] = []
1✔
990
        overlapping_candidates = filter_overlapping(arrow_candidates, search_region)
1✔
991

992
        for candidate in overlapping_candidates:
1✔
993
            assert candidate.constructed is not None
1✔
994
            assert isinstance(candidate.constructed, Arrow)
1✔
995
            log.debug(
1✔
996
                "[step]   Arrow at %s overlaps search region (score=%.2f)",
997
                candidate.bbox,
998
                candidate.score,
999
            )
1000
            arrows.append(candidate.constructed)
1✔
1001

1002
        log.debug(
1✔
1003
            "[step] Found %d arrows for step %d",
1004
            len(arrows),
1005
            step_num.value,
1006
        )
1007
        return arrows
1✔
1008

1009
    def _get_subassemblies_for_step(
1✔
1010
        self,
1011
        step_num: StepNumber,
1012
        diagram: Diagram | None,
1013
        result: ClassificationResult,
1014
    ) -> list[SubAssembly]:
1015
        """Get already-built subassemblies that belong to this step.
1016

1017
        Subassemblies are built in build_all() before steps. This method finds
1018
        subassemblies that are positioned near this step's diagram and haven't
1019
        been claimed by another step yet.
1020

1021
        Args:
1022
            step_num: The step number element
1023
            diagram: The diagram element (if any)
1024
            result: Classification result containing subassembly candidates
1025

1026
        Returns:
1027
            List of SubAssembly elements for this step
1028
        """
1029
        # Get all built subassemblies
1030
        all_subassemblies: list[SubAssembly] = []
×
1031
        for sa_candidate in result.get_built_candidates("subassembly"):
×
1032
            assert sa_candidate.constructed is not None
×
1033
            assert isinstance(sa_candidate.constructed, SubAssembly)
×
1034
            all_subassemblies.append(sa_candidate.constructed)
×
1035

1036
        log.debug(
×
1037
            "[step] Looking for subassemblies for step %d, found %d built",
1038
            step_num.value,
1039
            len(all_subassemblies),
1040
        )
1041

1042
        if not all_subassemblies:
×
1043
            return []
×
1044

1045
        # Determine search region based on step_number and diagram
1046
        reference_bbox = diagram.bbox.union(step_num.bbox) if diagram else step_num.bbox
×
1047

1048
        page_bbox = result.page_data.bbox
×
1049

1050
        # Expand search region around the reference area
1051
        # Horizontally: search the full page width
1052
        # Vertically: search a region around the reference bbox
1053
        vertical_margin = reference_bbox.height
×
1054
        search_region = BBox(
×
1055
            x0=page_bbox.x0,
1056
            y0=max(page_bbox.y0, reference_bbox.y0 - vertical_margin),
1057
            x1=page_bbox.x1,
1058
            y1=min(page_bbox.y1, reference_bbox.y1 + vertical_margin),
1059
        )
1060

1061
        log.debug(
×
1062
            "[step] SubAssembly search region for step %d: %s",
1063
            step_num.value,
1064
            search_region,
1065
        )
1066

1067
        # Find subassemblies that overlap the search region
1068
        subassemblies: list[SubAssembly] = []
×
1069
        for subassembly in all_subassemblies:
×
1070
            if subassembly.bbox.overlaps(search_region):
×
1071
                log.debug(
×
1072
                    "[step]   SubAssembly at %s overlaps search region",
1073
                    subassembly.bbox,
1074
                )
1075
                subassemblies.append(subassembly)
×
1076

1077
        log.debug(
×
1078
            "[step] Found %d subassemblies for step %d",
1079
            len(subassemblies),
1080
            step_num.value,
1081
        )
1082
        return subassemblies
×
1083

1084
    def _compute_step_bbox(
1✔
1085
        self,
1086
        step_num: StepNumber,
1087
        parts_list: PartsList | None,
1088
        diagram: Diagram | None,
1089
        subassemblies: list[SubAssembly],
1090
        page_bbox: BBox,
1091
    ) -> BBox:
1092
        """Compute the overall bounding box for the Step.
1093

1094
        This encompasses the step number, parts list (if any), diagram (if any),
1095
        and all subassemblies.
1096
        The result is clipped to the page bounds to handle elements that extend
1097
        slightly off-page (e.g., arrows in diagrams).
1098

1099
        Args:
1100
            step_num: The step number element
1101
            parts_list: The parts list (if any)
1102
            diagram: The diagram element (if any)
1103
            subassemblies: List of subassemblies for this step
1104
            page_bbox: The page bounding box to clip to
1105

1106
        Returns:
1107
            Combined bounding box, clipped to page bounds
1108
        """
1109
        bboxes = [step_num.bbox]
1✔
1110
        if parts_list:
1✔
1111
            bboxes.append(parts_list.bbox)
1✔
1112
        if diagram:
1✔
1113
            bboxes.append(diagram.bbox)
1✔
1114
        for sa in subassemblies:
1✔
1115
            bboxes.append(sa.bbox)
1✔
1116

1117
        return BBox.union_all(bboxes).clip_to(page_bbox)
1✔
1118

1119

1120
def assign_diagrams_to_steps(
1✔
1121
    steps: list[Step],
1122
    diagrams: list[Diagram],
1123
    divider_bboxes: Sequence[BBox] = (),
1124
    max_distance: float = 500.0,
1125
) -> None:
1126
    """Assign diagrams to steps using Hungarian algorithm.
1127

1128
    Uses the shared pairing module to find optimal step number to diagram
1129
    pairings based on:
1130
    - Position: step number should be in top-left of diagram
1131
    - Distance: closer is better
1132
    - Dividers: pairings should not cross divider lines
1133

1134
    This function mutates the Step objects in place, setting their diagram
1135
    attribute.
1136

1137
    Args:
1138
        steps: List of Step objects to assign diagrams to
1139
        diagrams: List of Diagram objects to assign
1140
        divider_bboxes: Sequence of divider bounding boxes to check for crossing
1141
        max_distance: Maximum distance for a valid assignment.
1142
    """
1143
    if not steps or not diagrams:
1✔
1144
        log.debug(
1✔
1145
            "[step] No diagram assignment needed: steps=%d, diagrams=%d",
1146
            len(steps),
1147
            len(diagrams),
1148
        )
1149
        return
1✔
1150

1151
    log.debug(
1✔
1152
        "[step] Running Hungarian matching for diagrams: %d steps, %d diagrams",
1153
        len(steps),
1154
        len(diagrams),
1155
    )
1156

1157
    # Configure pairing
1158
    config = PairingConfig(
1✔
1159
        max_distance=max_distance,
1160
        position_weight=0.5,
1161
        distance_weight=0.5,
1162
        check_dividers=True,
1163
        top_left_tolerance=100.0,
1164
    )
1165

1166
    # Extract bboxes
1167
    step_bboxes = [s.step_number.bbox for s in steps]
1✔
1168
    diagram_bboxes = [d.bbox for d in diagrams]
1✔
1169

1170
    # Find optimal pairings using shared logic
1171
    pairings = find_optimal_pairings(
1✔
1172
        step_bboxes, diagram_bboxes, config, divider_bboxes
1173
    )
1174

1175
    # Assign diagrams to steps based on the matching
1176
    for pairing in pairings:
1✔
1177
        step = steps[pairing.step_index]
1✔
1178
        diag = diagrams[pairing.diagram_index]
1✔
1179
        # Use object.__setattr__ because Step is a frozen Pydantic model
1180
        object.__setattr__(step, "diagram", diag)
1✔
1181
        log.debug(
1✔
1182
            "[step] Assigned diagram at %s to step %d (cost=%.2f)",
1183
            diag.bbox,
1184
            step.step_number.value,
1185
            pairing.cost,
1186
        )
1187

1188

1189
def assign_subassemblies_to_steps(
1✔
1190
    steps: list[Step],
1191
    subassemblies: list[SubAssembly],
1192
    divider_bboxes: Sequence[BBox] = (),
1193
    max_distance: float = 400.0,
1194
) -> None:
1195
    """Assign subassemblies to steps using Hungarian algorithm.
1196

1197
    Uses optimal bipartite matching to pair subassemblies with steps based on:
1198
    - Distance from step's diagram to subassembly (closer is better)
1199
    - No divider between the subassembly and the step's diagram
1200

1201
    This function mutates the Step objects in place, adding to their
1202
    subassemblies list.
1203

1204
    Args:
1205
        steps: List of Step objects to assign subassemblies to
1206
        subassemblies: List of SubAssembly objects to assign
1207
        divider_bboxes: Sequence of divider bounding boxes for obstruction checking
1208
        max_distance: Maximum distance for a valid assignment
1209
    """
1210
    if not steps or not subassemblies:
1✔
1211
        log.debug(
1✔
1212
            "[step] No subassembly assignment needed: steps=%d, subassemblies=%d",
1213
            len(steps),
1214
            len(subassemblies),
1215
        )
1216
        return
1✔
1217

1218
    n_steps = len(steps)
1✔
1219
    n_subassemblies = len(subassemblies)
1✔
1220

1221
    log.debug(
1✔
1222
        "[step] Running Hungarian matching for subassemblies: "
1223
        "%d steps, %d subassemblies",
1224
        n_steps,
1225
        n_subassemblies,
1226
    )
1227

1228
    # Build cost matrix: rows = subassemblies, cols = steps
1229
    cost_matrix = np.zeros((n_subassemblies, n_steps))
1✔
1230
    high_cost = max_distance * 10
1✔
1231

1232
    for i, sa in enumerate(subassemblies):
1✔
1233
        sa_center = sa.bbox.center
1✔
1234
        for j, step in enumerate(steps):
1✔
1235
            # Use diagram center if available, otherwise step bbox center
1236
            if step.diagram:
1✔
1237
                target_bbox = step.diagram.bbox
1✔
1238
                target_center = target_bbox.center
1✔
1239
            else:
1240
                target_bbox = step.bbox
×
1241
                target_center = step.step_number.bbox.center
×
1242

1243
            # Base cost is distance from diagram/step to subassembly center
1244
            distance = (
1✔
1245
                (target_center[0] - sa_center[0]) ** 2
1246
                + (target_center[1] - sa_center[1]) ** 2
1247
            ) ** 0.5
1248

1249
            # Check for divider between subassembly and step's diagram
1250
            if has_divider_between(sa.bbox, target_bbox, divider_bboxes):
1✔
1251
                # High cost if there's a divider between
1252
                distance = high_cost
1✔
1253
                log.debug(
1✔
1254
                    "[step]   Divider between subassembly[%d] at %s and step[%d]",
1255
                    i,
1256
                    sa.bbox,
1257
                    step.step_number.value,
1258
                )
1259

1260
            cost_matrix[i, j] = distance
1✔
1261
            log.debug(
1✔
1262
                "[step]   Cost subassembly[%d] at %s to step[%d]: %.1f",
1263
                i,
1264
                sa.bbox,
1265
                step.step_number.value,
1266
                distance,
1267
            )
1268

1269
    # Apply max_distance threshold
1270
    cost_matrix_thresholded = np.where(
1✔
1271
        cost_matrix > max_distance, high_cost, cost_matrix
1272
    )
1273

1274
    # Run Hungarian algorithm
1275
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1276

1277
    # Assign subassemblies to steps based on the matching
1278
    # Note: Unlike diagrams, multiple subassemblies can be assigned to one step
1279
    # The Hungarian algorithm gives us the best 1:1 matching, but we may want
1280
    # to assign additional nearby subassemblies too
1281

1282
    # First, do the 1:1 optimal assignment
1283
    assigned_subassemblies: set[int] = set()
1✔
1284
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1285
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1286
            sa = subassemblies[row_idx]
1✔
1287
            step = steps[col_idx]
1✔
1288
            # Add to step's subassemblies list
1289
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1290
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1291
            assigned_subassemblies.add(row_idx)
1✔
1292
            log.debug(
1✔
1293
                "[step] Assigned subassembly at %s to step %d (cost=%.1f)",
1294
                sa.bbox,
1295
                step.step_number.value,
1296
                cost_matrix[row_idx, col_idx],
1297
            )
1298

1299
    # Then, try to assign remaining unassigned subassemblies to their nearest step
1300
    # (if within max_distance and no divider between)
1301
    for i, sa in enumerate(subassemblies):
1✔
1302
        if i in assigned_subassemblies:
1✔
1303
            continue
1✔
1304

1305
        # Find the step with lowest cost for this subassembly
1306
        best_step_idx = None
1✔
1307
        best_cost = high_cost
1✔
1308
        for j in range(len(steps)):
1✔
1309
            if cost_matrix[i, j] < best_cost:
1✔
1310
                best_cost = cost_matrix[i, j]
1✔
1311
                best_step_idx = j
1✔
1312

1313
        if best_step_idx is not None and best_cost <= max_distance:
1✔
1314
            step = steps[best_step_idx]
1✔
1315
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1316
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1317
            log.debug(
1✔
1318
                "[step] Assigned extra subassembly at %s to step %d (cost=%.1f)",
1319
                sa.bbox,
1320
                step.step_number.value,
1321
                best_cost,
1322
            )
1323

1324

1325
def assign_rotation_symbols_to_steps(
1✔
1326
    steps: list[Step],
1327
    rotation_symbols: list[RotationSymbol],
1328
    max_distance: float = 300.0,
1329
) -> None:
1330
    """Assign rotation symbols to steps using Hungarian algorithm.
1331

1332
    Uses optimal bipartite matching to pair rotation symbols with steps based on
1333
    minimum distance from the rotation symbol to the step's diagram (or step bbox
1334
    center if no diagram).
1335

1336
    This function mutates the Step objects in place, setting their rotation_symbol
1337
    attribute.
1338

1339
    Args:
1340
        steps: List of Step objects to assign rotation symbols to
1341
        rotation_symbols: List of RotationSymbol objects to assign
1342
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1343
            greater than this will not be matched.
1344
    """
1345
    if not steps or not rotation_symbols:
1✔
1346
        log.debug(
1✔
1347
            "[step] No rotation symbol assignment needed: "
1348
            "steps=%d, rotation_symbols=%d",
1349
            len(steps),
1350
            len(rotation_symbols),
1351
        )
1352
        return
1✔
1353

1354
    n_steps = len(steps)
1✔
1355
    n_symbols = len(rotation_symbols)
1✔
1356

1357
    log.debug(
1✔
1358
        "[step] Running Hungarian matching: %d steps, %d rotation symbols",
1359
        n_steps,
1360
        n_symbols,
1361
    )
1362

1363
    # Build cost matrix: rows = rotation symbols, cols = steps
1364
    # Cost = distance from rotation symbol center to step's diagram center
1365
    # (or step bbox center if no diagram)
1366
    cost_matrix = np.zeros((n_symbols, n_steps))
1✔
1367

1368
    for i, rs in enumerate(rotation_symbols):
1✔
1369
        rs_center = rs.bbox.center
1✔
1370
        for j, step in enumerate(steps):
1✔
1371
            # Use diagram center if available, otherwise step bbox center
1372
            if step.diagram:
1✔
1373
                target_center = step.diagram.bbox.center
1✔
1374
            else:
1375
                target_center = step.bbox.center
×
1376

1377
            # Euclidean distance
1378
            distance = (
1✔
1379
                (rs_center[0] - target_center[0]) ** 2
1380
                + (rs_center[1] - target_center[1]) ** 2
1381
            ) ** 0.5
1382

1383
            cost_matrix[i, j] = distance
1✔
1384
            log.debug(
1✔
1385
                "[step]   Distance from rotation_symbol[%d] at %s to step[%d]: %.1f",
1386
                i,
1387
                rs.bbox,
1388
                step.step_number.value,
1389
                distance,
1390
            )
1391

1392
    # Apply max_distance threshold by setting costs above threshold to a high value
1393
    # This prevents assignments that are too far apart
1394
    high_cost = max_distance * 10  # Arbitrary large value
1✔
1395
    cost_matrix_thresholded = np.where(
1✔
1396
        cost_matrix > max_distance, high_cost, cost_matrix
1397
    )
1398

1399
    # Run Hungarian algorithm
1400
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1401

1402
    # Assign rotation symbols to steps based on the matching
1403
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1404
        # Check if this assignment is within the max_distance threshold
1405
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1406
            rs = rotation_symbols[row_idx]
1✔
1407
            step = steps[col_idx]
1✔
1408
            step.rotation_symbol = rs
1✔
1409
            # Expand step bbox to include the rotation symbol
1410
            step.bbox = step.bbox.union(rs.bbox)
1✔
1411
            log.debug(
1✔
1412
                "[step] Assigned rotation symbol at %s to step %d "
1413
                "(distance=%.1f, new step bbox=%s)",
1414
                rs.bbox,
1415
                step.step_number.value,
1416
                cost_matrix[row_idx, col_idx],
1417
                step.bbox,
1418
            )
1419
        else:
1420
            log.debug(
×
1421
                "[step] Skipped assignment: rotation_symbol[%d] to step[%d] "
1422
                "distance=%.1f > max_distance=%.1f",
1423
                row_idx,
1424
                col_indices[row_idx] if row_idx < len(col_indices) else -1,
1425
                cost_matrix[row_idx, col_idx],
1426
                max_distance,
1427
            )
1428

1429

1430
def filter_subassembly_values[T](
1✔
1431
    items: list[tuple[int, T]],
1432
    *,
1433
    min_gap: int = 3,
1434
    max_subassembly_start: int = 3,
1435
) -> list[tuple[int, T]]:
1436
    """Filter items to exclude likely subassembly values.
1437

1438
    Subassembly steps (e.g., step 1, 2 inside a subassembly box) often appear
1439
    alongside higher-numbered page-level steps (e.g., 15, 16). This creates
1440
    sequences like [1, 2, 15, 16] which cannot all be page-level steps.
1441

1442
    This function identifies such cases by detecting a significant gap in values
1443
    and filtering out the lower-numbered items that are likely subassembly steps.
1444

1445
    Args:
1446
        items: List of (value, item) tuples where value is the step number
1447
            and item is any associated data (e.g., a Candidate).
1448
        min_gap: Minimum gap size to trigger filtering (exclusive).
1449
            Default is 3, so gaps > 3 trigger filtering.
1450
        max_subassembly_start: Maximum starting value for subassembly detection.
1451
            If min value is <= this, filtering is applied. Default is 3.
1452

1453
    Returns:
1454
        Filtered list of (value, item) tuples, keeping only the higher values
1455
        if a subassembly pattern is detected. Returns original list if no
1456
        filtering is needed.
1457

1458
    Examples:
1459
        >>> filter_subassembly_values([(1, "a"), (2, "b"), (15, "c"), (16, "d")])
1460
        [(15, "c"), (16, "d")]
1461

1462
        >>> filter_subassembly_values([(5, "a"), (6, "b"), (15, "c")])
1463
        [(5, "a"), (6, "b"), (15, "c")]  # min=5 > 3, no filtering
1464
    """
1465
    if len(items) <= 1:
1✔
1466
        return items
1✔
1467

1468
    # Sort by value
1469
    sorted_items = sorted(items, key=lambda x: x[0])
1✔
1470
    values = [v for v, _ in sorted_items]
1✔
1471

1472
    # Find the largest gap between consecutive values
1473
    max_gap_size = 0
1✔
1474
    max_gap_index = -1
1✔
1475
    for i in range(len(values) - 1):
1✔
1476
        gap = values[i + 1] - values[i]
1✔
1477
        if gap > max_gap_size:
1✔
1478
            max_gap_size = gap
1✔
1479
            max_gap_index = i
1✔
1480

1481
    # Check if filtering should occur:
1482
    # 1. Gap must be larger than min_gap
1483
    # 2. The minimum value must be <= max_subassembly_start
1484
    if max_gap_size > min_gap and max_gap_index >= 0:
1✔
1485
        min_value = values[0]
1✔
1486
        if min_value <= max_subassembly_start:
1✔
1487
            threshold = values[max_gap_index + 1]
1✔
1488
            return [(v, item) for v, item in sorted_items if v >= threshold]
1✔
1489

1490
    return items
1✔
1491

1492

1493
def filter_valid_substep_sequence[T](
1✔
1494
    items: list[T],
1495
    get_value: Callable[[T], int],
1496
) -> list[T]:
1497
    """Filter substeps to only include valid sequential values starting from 1.
1498

1499
    Substeps within a subassembly or page must form a valid sequence:
1500
    - Must start from 1
1501
    - Values must be consecutive (1, 2, 3, ... not 1, 3, 5)
1502

1503
    This filters out isolated numbers that are likely PieceLengths or other
1504
    misidentified elements.
1505

1506
    Args:
1507
        items: List of substep items to filter
1508
        get_value: Function to extract the step number value from an item
1509

1510
    Returns:
1511
        Filtered list containing only items that form a valid sequence from 1.
1512
        Returns empty list if no valid sequence starting from 1 is found.
1513

1514
    Examples:
1515
        >>> filter_valid_substep_sequence([1, 2, 3], lambda x: x)
1516
        [1, 2, 3]
1517

1518
        >>> filter_valid_substep_sequence([3, 5], lambda x: x)
1519
        []  # No item with value 1
1520

1521
        >>> filter_valid_substep_sequence([1, 2, 5], lambda x: x)
1522
        [1, 2]  # 5 breaks the sequence
1523
    """
1524
    if not items:
1✔
1525
        return []
×
1526

1527
    # Build a map from value to items
1528
    value_to_items: dict[int, list[T]] = {}
1✔
1529
    for item in items:
1✔
1530
        value = get_value(item)
1✔
1531
        if value not in value_to_items:
1✔
1532
            value_to_items[value] = []
1✔
1533
        value_to_items[value].append(item)
1✔
1534

1535
    # Must have step 1 to be a valid sequence
1536
    if 1 not in value_to_items:
1✔
1537
        log.debug("[substep_filter] No substep with value 1 found - rejecting all")
1✔
1538
        return []
1✔
1539

1540
    # Collect items in sequence starting from 1
1541
    result: list[T] = []
1✔
1542
    expected_value = 1
1✔
1543

1544
    while expected_value in value_to_items:
1✔
1545
        result.extend(value_to_items[expected_value])
1✔
1546
        expected_value += 1
1✔
1547

1548
    log.debug(
1✔
1549
        "[substep_filter] Filtered %d items to %d valid sequential substeps (1-%d)",
1550
        len(items),
1551
        len(result),
1552
        expected_value - 1,
1553
    )
1554

1555
    return result
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc