• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20361865516

19 Dec 2025 06:25AM UTC coverage: 89.13% (-0.002%) from 89.132%
20361865516

push

github

bramp
Fix lint errors: line length, unused imports, and YAML issues

- Add ruff isort configuration with known-first-party for build_a_long
- Add per-file E501 ignore for legocom_test.py (JSON test data)
- Create .yamllint config to relax strict YAML rules
- Fix E501 line length errors by wrapping long comments and strings
- Fix F841 unused variable errors
- Fix PLC0415 import-at-non-top-level errors
- Fix SIM108 ternary simplification errors

12 of 14 new or added lines in 8 files covered. (85.71%)

78 existing lines in 6 files now uncovered.

12915 of 14490 relevant lines covered (89.13%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.8
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from collections.abc import Callable, Sequence
1✔
25

26
import numpy as np
1✔
27
from scipy.optimize import linear_sum_assignment
1✔
28

29
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
30
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
31
    CandidateFailedError,
32
    ClassificationResult,
33
)
34
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
35
    LabelClassifier,
36
)
37
from build_a_long.pdf_extract.classifier.rule_based_classifier import (
1✔
38
    StepNumberScore,
39
)
40
from build_a_long.pdf_extract.classifier.score import (
1✔
41
    Score,
42
    Weight,
43
    find_best_scoring,
44
)
45
from build_a_long.pdf_extract.classifier.steps.pairing import (
1✔
46
    PairingConfig,
47
    find_optimal_pairings,
48
    has_divider_between,
49
)
50
from build_a_long.pdf_extract.classifier.steps.substep_classifier import _SubStepScore
1✔
51
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
52
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
53
    Arrow,
54
    Diagram,
55
    Divider,
56
    LegoPageElements,
57
    PartsList,
58
    RotationSymbol,
59
    Step,
60
    StepNumber,
61
    SubAssembly,
62
    SubStep,
63
)
64

65
log = logging.getLogger(__name__)
1✔
66

67

68
class _StepScore(Score):
1✔
69
    """Internal score representation for step classification."""
70

71
    step_value: int
72
    """The parsed step number value (e.g., 1, 2, 3)."""
1✔
73

74
    step_number_candidate: Candidate
75
    """The step number candidate this step is associated with."""
1✔
76

77
    parts_list_candidate: Candidate | None
78
    """The parts list candidate paired with this step (if any)."""
1✔
79

80
    has_parts_list: bool
81
    """Whether this step has an associated parts list."""
1✔
82

83
    step_proximity_score: float
84
    """Score based on proximity to the PartsList above (0.0-1.0).
1✔
85
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
86

87
    step_alignment_score: float
88
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
1✔
89
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
90

91
    def score(self) -> Weight:
1✔
92
        """Return the overall pairing score."""
93
        return self.overall_score()
×
94

95
    def overall_score(self) -> float:
1✔
96
        """Calculate overall quality score based on parts list pairing.
97

98
        Steps with a parts_list are given a bonus to prefer them over
99
        steps without parts_list. Diagrams are found at build time, not
100
        during scoring, to allow rotation symbols to claim small images first.
101
        """
102
        if self.has_parts_list:
1✔
103
            # Base score for having parts_list + proximity/alignment bonus
104
            parts_list_bonus = 0.5
1✔
105
            pairing_score = (
1✔
106
                self.step_proximity_score + self.step_alignment_score
107
            ) / 2.0
108
            return parts_list_bonus + 0.5 * pairing_score
1✔
109
        return 0.3  # Lower base score for steps without parts list
1✔
110

111
    def sort_key(self) -> tuple[float, int]:
1✔
112
        """Return a tuple for sorting candidates.
113

114
        We prefer:
115
        1. Higher overall scores (better StepNumber-PartsList-Diagram match)
116
        2. Lower step number values (to break ties and maintain order)
117
        """
118
        return (-self.overall_score(), self.step_value)
1✔
119

120

121
class StepClassifier(LabelClassifier):
1✔
122
    """Classifier for complete Step structures."""
123

124
    output = "step"
1✔
125
    requires = frozenset(
1✔
126
        {
127
            "arrow",
128
            "step_number",
129
            "parts_list",
130
            "diagram",
131
            "rotation_symbol",
132
            "subassembly",
133
            "substep",
134
            "preview",
135
        }
136
    )
137

138
    def _score(self, result: ClassificationResult) -> None:
1✔
139
        """Score step pairings and create candidates."""
140
        page_data = result.page_data
1✔
141

142
        # Get step number and parts list candidates (not constructed elements)
143
        step_number_candidates = result.get_scored_candidates(
1✔
144
            "step_number", valid_only=False, exclude_failed=True
145
        )
146

147
        if not step_number_candidates:
1✔
148
            return
1✔
149

150
        # Get parts_list candidates
151
        parts_list_candidates = result.get_scored_candidates(
1✔
152
            "parts_list",
153
            valid_only=False,
154
            exclude_failed=True,
155
        )
156

157
        log.debug(
1✔
158
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
159
            page_data.page_number,
160
            len(step_number_candidates),
161
            len(parts_list_candidates),
162
        )
163

164
        # Create all possible Step candidates for pairings (without diagrams initially)
165
        # Deduplication happens at build time, not scoring time, so that
166
        # diagram assignment can be re-evaluated as blocks get claimed.
167
        all_candidates: list[Candidate] = []
1✔
168
        for step_candidate in step_number_candidates:
1✔
169
            # Create candidates for this StepNumber paired with each PartsList
170
            for parts_list_candidate in parts_list_candidates:
1✔
171
                candidate = self._create_step_candidate(
1✔
172
                    step_candidate, parts_list_candidate, result
173
                )
174
                if candidate:
1✔
175
                    all_candidates.append(candidate)
1✔
176

177
            # Also create a candidate with no PartsList (fallback)
178
            candidate = self._create_step_candidate(step_candidate, None, result)
1✔
179
            if candidate:
1✔
180
                all_candidates.append(candidate)
1✔
181

182
        # Add all candidates to result (deduplication happens at build time)
183
        for candidate in all_candidates:
1✔
184
            result.add_candidate(candidate)
1✔
185

186
        log.debug(
1✔
187
            "[step] Created %d step candidates",
188
            len(all_candidates),
189
        )
190

191
    def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
1✔
192
        """Build all Step elements with coordinated element assignment.
193

194
        Build Order (phases run sequentially):
195
        - Phase 1: Rotation symbols - claim small circular arrow graphics
196
        - Phase 2: Parts lists - claim part count blocks and part images
197
        - Phase 3: Arrows - claim arrowhead triangles and shaft lines/rectangles
198
        - Phase 4: Subassemblies & Previews - claim white callout boxes
199
        - Phase 5: Steps - build step_number + parts_list pairs (no diagram yet)
200
        - Phase 6: Diagrams - build remaining unclaimed diagram regions
201
        - Phase 7: Assign diagrams to steps via Hungarian matching
202
        - Phase 8: Assign subassemblies to steps via Hungarian matching
203
        - Phase 9: Finalize steps - add arrows, compute final bboxes
204
        - Phase 10: Assign rotation symbols to steps via Hungarian matching
205

206
        The order matters because elements compete for the same Drawing blocks.
207
        Earlier phases claim blocks first, preventing later phases from using them.
208
        For example:
209
        - Arrows must build before subassemblies so arrow shafts aren't consumed
210
          by subassembly white boxes
211
        - Rotation symbols must build before diagrams so small circular graphics
212
          aren't incorrectly clustered into diagrams
213

214
        Deduplication of steps happens at build time (not scoring time) so that
215
        diagram assignment can be re-evaluated as blocks get claimed.
216

217
        Returns:
218
            List of successfully constructed Step elements, sorted by step number
219
        """
220
        # Pre-check: Get step candidates early to decide if we should build
221
        # supporting elements (rotation symbols, arrows, etc.)
222
        # On INFO pages with no steps, building these would create orphaned elements.
223
        all_step_candidates = result.get_scored_candidates(
1✔
224
            "step", valid_only=False, exclude_failed=True
225
        )
226
        has_step_candidates = bool(all_step_candidates)
1✔
227

228
        # Phase 1: Build all rotation symbols BEFORE steps.
229
        # This allows rotation symbols to claim their Drawing blocks first,
230
        # preventing them from being incorrectly clustered into diagrams.
231
        #
232
        # IMPORTANT: Only build rotation symbols if there are step candidates.
233
        # On INFO pages, there are no steps to assign rotation symbols to.
234
        if has_step_candidates:
1✔
235
            for rs_candidate in result.get_scored_candidates(
1✔
236
                "rotation_symbol", valid_only=False, exclude_failed=True
237
            ):
238
                try:
1✔
239
                    result.build(rs_candidate)
1✔
240
                    log.debug(
1✔
241
                        "[step] Built rotation symbol at %s (score=%.2f)",
242
                        rs_candidate.bbox,
243
                        rs_candidate.score,
244
                    )
UNCOV
245
                except Exception as e:
×
UNCOV
246
                    log.debug(
×
247
                        "[step] Failed to construct rotation_symbol "
248
                        "candidate at %s: %s",
249
                        rs_candidate.bbox,
250
                        e,
251
                    )
252
        else:
253
            log.debug("[step] Skipping rotation symbol build - no step candidates")
1✔
254

255
        # Phase 2: Build all parts lists BEFORE steps.
256
        # This allows parts lists to claim their Drawing blocks first,
257
        # preventing them from being claimed by subassemblies.
258
        for pl_candidate in result.get_scored_candidates(
1✔
259
            "parts_list", valid_only=False, exclude_failed=True
260
        ):
261
            try:
1✔
262
                result.build(pl_candidate)
1✔
263
                log.debug(
1✔
264
                    "[step] Built parts_list at %s (score=%.2f)",
265
                    pl_candidate.bbox,
266
                    pl_candidate.score,
267
                )
UNCOV
268
            except Exception as e:
×
UNCOV
269
                log.debug(
×
270
                    "[step] Failed to construct parts_list candidate at %s: %s",
271
                    pl_candidate.bbox,
272
                    e,
273
                )
274

275
        # Phase 3: Build all arrows BEFORE subassemblies.
276
        # Arrows connect subassembly callout boxes to main diagrams. The arrow
277
        # shaft may be a thin rectangle that overlaps the subassembly's white
278
        # box. By building arrows first, they claim the shaft blocks before
279
        # subassemblies can consume them.
280
        #
281
        # IMPORTANT: Only build arrows if there are step candidates.
282
        # On INFO pages, there are no steps to assign arrows to, so building
283
        # them would create orphaned elements that trigger validation errors.
284
        # Note: has_step_candidates was computed in the pre-check.
285
        if has_step_candidates:
1✔
286
            for arrow_candidate in result.get_scored_candidates(
1✔
287
                "arrow", valid_only=False, exclude_failed=True
288
            ):
289
                try:
1✔
290
                    result.build(arrow_candidate)
1✔
291
                    log.debug(
1✔
292
                        "[step] Built arrow at %s (score=%.2f)",
293
                        arrow_candidate.bbox,
294
                        arrow_candidate.score,
295
                    )
UNCOV
296
                except Exception as e:
×
UNCOV
297
                    log.debug(
×
298
                        "[step] Failed to construct arrow candidate at %s: %s",
299
                        arrow_candidate.bbox,
300
                        e,
301
                    )
302
        else:
303
            log.debug("[step] Skipping arrow build - no step candidates on page")
1✔
304

305
        # Phase 4: Build subassemblies and previews BEFORE steps.
306
        # Both subassemblies and previews are white boxes with diagrams inside.
307
        # We combine them and build in score order so the higher-scoring
308
        # candidate claims the white box first. When a candidate is built,
309
        # its source_blocks are marked as consumed, causing any competing
310
        # candidate using the same blocks to fail.
311
        #
312
        # This allows subassemblies (which have step_count labels like "2x")
313
        # to be distinguished from previews (which appear before steps).
314
        subassembly_candidates = result.get_scored_candidates(
1✔
315
            "subassembly", valid_only=False, exclude_failed=True
316
        )
317
        preview_candidates = result.get_scored_candidates(
1✔
318
            "preview", valid_only=False, exclude_failed=True
319
        )
320

321
        # Combine and sort by score (highest first)
322
        combined_candidates = list(subassembly_candidates) + list(preview_candidates)
1✔
323
        combined_candidates.sort(key=lambda c: c.score, reverse=True)
1✔
324

325
        for candidate in combined_candidates:
1✔
326
            try:
1✔
327
                result.build(candidate)
1✔
328
                log.debug(
1✔
329
                    "[step] Built %s at %s (score=%.2f)",
330
                    candidate.label,
331
                    candidate.bbox,
332
                    candidate.score,
333
                )
334
            except Exception as e:
1✔
335
                log.debug(
1✔
336
                    "[step] Failed to construct %s candidate at %s: %s",
337
                    candidate.label,
338
                    candidate.bbox,
339
                    e,
340
                )
341

342
        # Phase 5: Build Step candidates with deduplication by step value
343
        # Filter out candidates that look like substeps (small step numbers with
344
        # a significant gap from main step numbers). These are handled by
345
        # SubStepClassifier instead.
346
        # Steps are built as "partial" - just step_number + parts_list.
347
        # Diagrams and subassemblies are assigned later via Hungarian matching.
348
        # Note: all_step_candidates was already fetched in Phase 3.
349
        page_level_step_candidates = self._filter_page_level_step_candidates(
1✔
350
            all_step_candidates
351
        )
352

353
        # Sort by score (highest first) so best candidates get built first
354
        sorted_candidates = sorted(
1✔
355
            page_level_step_candidates,
356
            key=lambda c: c.score,
357
            reverse=True,
358
        )
359

360
        steps: list[Step] = []
1✔
361
        built_step_values: set[int] = set()
1✔
362

363
        for step_candidate in sorted_candidates:
1✔
364
            # Get step value from score
365
            score = step_candidate.score_details
1✔
366
            if not isinstance(score, _StepScore):
1✔
UNCOV
367
                continue
×
368

369
            # Skip if we've already built a step with this value
370
            if score.step_value in built_step_values:
1✔
371
                log.debug(
1✔
372
                    "[step] Skipping duplicate step %d candidate (score=%.2f)",
373
                    score.step_value,
374
                    step_candidate.score,
375
                )
376
                continue
1✔
377

378
            try:
1✔
379
                elem = result.build(step_candidate)
1✔
380
                assert isinstance(elem, Step)
1✔
381
                steps.append(elem)
1✔
382
                built_step_values.add(score.step_value)
1✔
383
                log.debug(
1✔
384
                    "[step] Built partial step %d (parts_list=%s, score=%.2f)",
385
                    score.step_value,
386
                    "yes" if score.parts_list_candidate is not None else "no",
387
                    step_candidate.score,
388
                )
389
            except Exception as e:
1✔
390
                log.debug(
1✔
391
                    "[step] Failed to construct step %d candidate: %s",
392
                    score.step_value,
393
                    e,
394
                )
395

396
        # Sort steps by their step_number value
397
        steps.sort(key=lambda step: step.step_number.value)
1✔
398

399
        # Phase 6: Build diagrams (not yet claimed by subassemblies)
400
        # Phase 7: Assign diagrams to steps using Hungarian matching
401
        # Collect available diagram candidates (not yet claimed by subassemblies)
402
        available_diagrams: list[Diagram] = []
1✔
403
        for diag_candidate in result.get_scored_candidates(
1✔
404
            "diagram", valid_only=False, exclude_failed=True
405
        ):
406
            if diag_candidate.constructed is None:
1✔
407
                # Build the diagram
408
                try:
1✔
409
                    diag_elem = result.build(diag_candidate)
1✔
410
                    assert isinstance(diag_elem, Diagram)
1✔
411
                    available_diagrams.append(diag_elem)
1✔
412
                except Exception as e:
1✔
413
                    log.debug(
1✔
414
                        "[step] Failed to build diagram at %s: %s",
415
                        diag_candidate.bbox,
416
                        e,
417
                    )
418
            elif isinstance(diag_candidate.constructed, Diagram):
1✔
419
                # Already built (claimed by subassembly) - skip
420
                pass
1✔
421

422
        log.debug(
1✔
423
            "[step] Available diagrams for step assignment: %d",
424
            len(available_diagrams),
425
        )
426

427
        # Collect divider bboxes for obstruction checking (used by multiple phases)
428
        divider_bboxes: list[BBox] = []
1✔
429
        for div_candidate in result.get_scored_candidates("divider", valid_only=True):
1✔
430
            assert div_candidate.constructed is not None
1✔
431
            assert isinstance(div_candidate.constructed, Divider)
1✔
432
            divider_bboxes.append(div_candidate.constructed.bbox)
1✔
433

434
        # Assign diagrams to steps using Hungarian matching
435
        # Dividers prevent pairing across page divisions
436
        assign_diagrams_to_steps(steps, available_diagrams, divider_bboxes)
1✔
437

438
        # Phase 8: Assign subassemblies to steps using Hungarian matching
439
        # Collect built subassemblies
440
        subassemblies: list[SubAssembly] = []
1✔
441
        for sa_candidate in result.get_scored_candidates(
1✔
442
            "subassembly", valid_only=True
443
        ):
444
            assert sa_candidate.constructed is not None
1✔
445
            assert isinstance(sa_candidate.constructed, SubAssembly)
1✔
446
            subassemblies.append(sa_candidate.constructed)
1✔
447

448
        log.debug(
1✔
449
            "[step] Assigning %d subassemblies to %d steps (%d dividers for "
450
            "obstruction checking)",
451
            len(subassemblies),
452
            len(steps),
453
            len(divider_bboxes),
454
        )
455

456
        # Assign subassemblies to steps using Hungarian matching
457
        assign_subassemblies_to_steps(steps, subassemblies, divider_bboxes)
1✔
458

459
        # Phase 8b: Get unclaimed SubSteps as naked substeps
460
        # SubStepClassifier found small step number + diagram pairs.
461
        # SubAssemblyClassifier claimed those inside callout boxes.
462
        # The remaining ones become "naked" substeps for the main step.
463
        if steps:
1✔
464
            substeps = self._get_unclaimed_substeps(result)
1✔
465
            if substeps:
1✔
466
                # Assign all substeps to the single main step on the page
467
                # (In the future, we could use Hungarian matching if there are
468
                # multiple main steps, but for now this covers the common case)
469
                main_step = steps[0]  # Usually only one main step when substeps exist
1✔
470
                object.__setattr__(main_step, "substeps", substeps)
1✔
471
                log.debug(
1✔
472
                    "[step] Assigned %d substeps to step %d",
473
                    len(substeps),
474
                    main_step.step_number.value,
475
                )
476

477
        # Phase 9: Finalize steps - collect arrows and compute final bboxes
478
        page_bbox = result.page_data.bbox
1✔
479
        for step in steps:
1✔
480
            # Get arrows for this step (pass subassemblies for search region)
481
            arrows = self._get_arrows_for_step(
1✔
482
                step.step_number, step.diagram, step.subassemblies, result
483
            )
484

485
            # Compute final bbox including all components
486
            final_bbox = self._compute_step_bbox(
1✔
487
                step.step_number,
488
                step.parts_list,
489
                step.diagram,
490
                step.subassemblies,
491
                page_bbox,
492
            )
493

494
            # Update the step (Step is a Pydantic model, so we need to reassign)
495
            # We mutate in place by directly setting attributes
496
            object.__setattr__(step, "arrows", arrows)
1✔
497
            object.__setattr__(step, "bbox", final_bbox)
1✔
498

499
        # Phase 10: Assign rotation symbols to steps using Hungarian matching
500
        # Collect built rotation symbols
501
        rotation_symbols: list[RotationSymbol] = []
1✔
502
        for rs_candidate in result.get_scored_candidates(
1✔
503
            "rotation_symbol", valid_only=True
504
        ):
505
            assert rs_candidate.constructed is not None
1✔
506
            assert isinstance(rs_candidate.constructed, RotationSymbol)
1✔
507
            rotation_symbols.append(rs_candidate.constructed)
1✔
508

509
        assign_rotation_symbols_to_steps(steps, rotation_symbols)
1✔
510

511
        log.debug(
1✔
512
            "[step] build_all complete: %d steps, %d rotation symbols assigned",
513
            len(steps),
514
            sum(1 for s in steps if s.rotation_symbol is not None),
515
        )
516

517
        # Cast for type compatibility with base class return type
518
        return list[LegoPageElements](steps)
1✔
519

520
    def _filter_page_level_step_candidates(
1✔
521
        self, candidates: list[Candidate]
522
    ) -> list[Candidate]:
523
        """Filter step candidates to exclude likely substep candidates.
524

525
        When a page has both low-numbered steps (1, 2, 3, 4) and high-numbered
526
        steps (338), with a significant gap between them, the low numbers are
527
        likely substeps rather than main steps. These are handled by
528
        SubStepClassifier instead.
529

530
        Args:
531
            candidates: All step candidates to filter
532

533
        Returns:
534
            Page-level step candidates (excluding substep candidates)
535
        """
536
        # Extract step number values from candidates
537
        items_with_values: list[tuple[int, Candidate]] = []
1✔
538
        for candidate in candidates:
1✔
539
            score = candidate.score_details
1✔
540
            if not isinstance(score, _StepScore):
1✔
UNCOV
541
                continue
×
542
            items_with_values.append((score.step_value, candidate))
1✔
543

544
        # Use generic filtering function to find page-level steps
545
        filtered_items = filter_subassembly_values(items_with_values)
1✔
546

547
        # If filtering occurred, return only page-level candidates
548
        if len(filtered_items) != len(items_with_values):
1✔
549
            page_level_values = {v for v, _ in filtered_items}
1✔
550
            page_level_candidates = [
1✔
551
                item for v, item in items_with_values if v in page_level_values
552
            ]
553
            log.debug(
1✔
554
                "[step] Filtered to page-level candidates: %s (excluded substeps: %s)",
555
                sorted(page_level_values),
556
                sorted(v for v, _ in items_with_values if v not in page_level_values),
557
            )
558
            return page_level_candidates
1✔
559

560
        # No filtering occurred, return all as page-level
561
        return candidates
1✔
562

563
    def _get_unclaimed_substeps(
1✔
564
        self,
565
        result: ClassificationResult,
566
    ) -> list[SubStep]:
567
        """Get SubStep elements that weren't claimed by a SubAssembly.
568

569
        SubStepClassifier creates candidates for all small step number +
570
        diagram pairs. SubAssemblyClassifier then claims those inside callout boxes.
571
        This method returns the remaining unclaimed SubSteps for use as
572
        "naked" substeps in a Step.
573

574
        The substeps are filtered to only include valid sequences starting from 1.
575
        This helps filter out PieceLengths or other misidentified elements.
576

577
        Args:
578
            result: Classification result
579

580
        Returns:
581
            List of SubStep elements that weren't claimed by any SubAssembly,
582
            sorted by step number value, and filtered to valid sequences from 1.
583
        """
584
        # Get all substep candidates
585
        substep_candidates = result.get_scored_candidates(
1✔
586
            "substep", valid_only=False, exclude_failed=True
587
        )
588

589
        # Filter to only unclaimed candidates (not yet built)
590
        unclaimed_candidates = [c for c in substep_candidates if c.constructed is None]
1✔
591

592
        if not unclaimed_candidates:
1✔
593
            return []
1✔
594

595
        # Extract step values from candidates to filter BEFORE building
596
        def get_step_value(candidate: Candidate) -> int:
1✔
597
            """Get step value from a substep candidate's score."""
598
            score = candidate.score_details
1✔
599
            assert isinstance(score, _SubStepScore)
1✔
600

601
            return score.step_value
1✔
602

603
        # Filter candidates to valid sequential substeps (must start from 1)
604
        valid_candidates = filter_valid_substep_sequence(
1✔
605
            unclaimed_candidates, get_step_value
606
        )
607

608
        if not valid_candidates:
1✔
609
            log.debug("[step] No valid substep sequence found (no step 1)")
1✔
610
            return []
1✔
611

612
        # Now build only the filtered candidates
613
        substeps: list[SubStep] = []
1✔
614
        for candidate in valid_candidates:
1✔
615
            try:
1✔
616
                substep_elem = result.build(candidate)
1✔
617
                assert isinstance(substep_elem, SubStep)
1✔
618
                substeps.append(substep_elem)
1✔
619
            except CandidateFailedError:
1✔
620
                # Already claimed - skip
621
                continue
1✔
622

623
        # Sort by step number value
624
        substeps.sort(key=lambda s: s.step_number.value)
1✔
625

626
        log.debug(
1✔
627
            "[step] Found %d unclaimed substeps (after sequence filtering)",
628
            len(substeps),
629
        )
630

631
        return substeps
1✔
632

633
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
1✔
634
        """Construct a partial Step element from a single candidate.
635

636
        This creates a Step with just step_number and parts_list. The diagram,
637
        subassemblies, and arrows are assigned later in build_all() using
638
        Hungarian matching for optimal global assignment.
639
        """
640
        score = candidate.score_details
1✔
641
        assert isinstance(score, _StepScore)
1✔
642

643
        # Validate and extract step number from parent candidate
644
        step_num_candidate = score.step_number_candidate
1✔
645

646
        step_num_elem = result.build(step_num_candidate)
1✔
647
        assert isinstance(step_num_elem, StepNumber)
1✔
648
        step_num = step_num_elem
1✔
649

650
        # Validate and extract parts list from parent candidate (if present)
651
        parts_list = None
1✔
652
        if score.parts_list_candidate:
1✔
653
            parts_list_candidate = score.parts_list_candidate
1✔
654
            parts_list_elem = result.build(parts_list_candidate)
1✔
655
            assert isinstance(parts_list_elem, PartsList)
1✔
656
            parts_list = parts_list_elem
1✔
657

658
        # Create partial step - diagram, subassemblies, arrows assigned later
659
        initial_bbox = step_num.bbox
1✔
660
        if parts_list:
1✔
661
            initial_bbox = initial_bbox.union(parts_list.bbox)
1✔
662

663
        return Step(
1✔
664
            bbox=initial_bbox,
665
            step_number=step_num,
666
            parts_list=parts_list,
667
            diagram=None,
668
            rotation_symbol=None,
669
            arrows=[],
670
            subassemblies=[],
671
        )
672

673
    def _find_and_build_diagram_for_step(
1✔
674
        self,
675
        step_num: StepNumber,
676
        parts_list: PartsList | None,
677
        result: ClassificationResult,
678
    ) -> Diagram | None:
679
        """Find and build the best diagram for this step.
680

681
        This is called at build time, after rotation symbols and subassemblies
682
        have been built (in build_all Phases 1 and 3), so they've already
683
        claimed any images/diagrams they need. We filter to only consider
684
        diagram candidates that haven't been constructed yet.
685

686
        Args:
687
            step_num: The built step number element
688
            parts_list: The built parts list element (if any)
689
            result: Classification result containing diagram candidates
690

691
        Returns:
692
            The built Diagram element, or None if no suitable diagram found
693
        """
694
        # Get all non-failed, non-constructed diagram candidates
UNCOV
695
        diagram_candidates = result.get_scored_candidates(
×
696
            "diagram", valid_only=False, exclude_failed=True
697
        )
698

699
        # Filter to only candidates that haven't been built yet
700
        # (subassemblies and rotation symbols built earlier may have claimed some)
701
        available_candidates = [c for c in diagram_candidates if c.constructed is None]
×
702

703
        if not available_candidates:
×
704
            log.debug(
×
705
                "[step] No diagram candidates available for step %d",
706
                step_num.value,
707
            )
UNCOV
708
            return None
×
709

710
        # Score each candidate based on position relative to step
UNCOV
711
        step_bbox = step_num.bbox
×
UNCOV
712
        best_candidate = None
×
713
        best_score = -float("inf")
×
714

715
        for candidate in available_candidates:
×
UNCOV
716
            score = self._score_step_diagram_pair(step_bbox, candidate.bbox)
×
717

718
            log.debug(
×
719
                "[step] Diagram candidate at %s for step %d: score=%.2f",
720
                candidate.bbox,
721
                step_num.value,
722
                score,
723
            )
724

UNCOV
725
            if score > best_score:
×
726
                best_score = score
×
727
                best_candidate = candidate
×
728

729
        if best_candidate is None or best_score < 0.2:
×
UNCOV
730
            log.debug(
×
731
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
732
                step_num.value,
733
                best_score,
734
            )
735
            return None
×
736

737
        # Build the diagram
UNCOV
738
        try:
×
UNCOV
739
            diagram_elem = result.build(best_candidate)
×
UNCOV
740
            assert isinstance(diagram_elem, Diagram)
×
UNCOV
741
            log.debug(
×
742
                "[step] Built diagram at %s for step %d (score=%.2f)",
743
                diagram_elem.bbox,
744
                step_num.value,
745
                best_score,
746
            )
UNCOV
747
            return diagram_elem
×
UNCOV
748
        except CandidateFailedError as e:
×
UNCOV
749
            log.debug(
×
750
                "[step] Failed to build diagram for step %d: %s",
751
                step_num.value,
752
                e,
753
            )
UNCOV
754
            return None
×
755

756
    def _create_step_candidate(
1✔
757
        self,
758
        step_candidate: Candidate,
759
        parts_list_candidate: Candidate | None,
760
        result: ClassificationResult,
761
    ) -> Candidate | None:
762
        """Create a Step candidate (without diagram assignment).
763

764
        Diagrams are found at build time, not during scoring, to allow
765
        rotation symbols to claim small images first.
766

767
        Args:
768
            step_candidate: The StepNumber candidate for this step
769
            parts_list_candidate: The PartsList candidate to pair with (or None)
770
            result: Classification result
771

772
        Returns:
773
            The created Candidate with score but no construction, or None if
774
            the step number value cannot be extracted.
775
        """
776
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
1✔
777
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
1✔
778
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance
1✔
779

780
        # Extract step number value from the candidate's score
781
        score_details = step_candidate.score_details
1✔
782
        if not isinstance(score_details, StepNumberScore):
1✔
UNCOV
783
            return None
×
784
        step_value = score_details.step_value
1✔
785
        if step_value == 0:
1✔
UNCOV
786
            return None
×
787

788
        step_bbox = step_candidate.bbox
1✔
789
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None
1✔
790

791
        # Calculate pairing scores if there's a parts_list above the step
792
        proximity_score = 0.0
1✔
793
        alignment_score = 0.0
1✔
794

795
        if (
1✔
796
            parts_list_bbox is not None
797
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
798
        ):
799
            # Calculate distance (how far apart vertically)
800
            distance = step_bbox.y0 - parts_list_bbox.y1
1✔
801

802
            # Calculate proximity score
803
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
1✔
804
            if max_distance > 0:
1✔
805
                proximity_score = max(0.0, 1.0 - (distance / max_distance))
1✔
806

807
            # Calculate alignment score (how well left edges align)
808
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
1✔
809
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
1✔
810
            if max_alignment_diff > 0:
1✔
811
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))
1✔
812

813
        # Create score object with candidate references
814
        # Diagrams are found at build time, not during scoring
815
        score = _StepScore(
1✔
816
            step_value=step_value,
817
            step_number_candidate=step_candidate,
818
            parts_list_candidate=parts_list_candidate,
819
            has_parts_list=parts_list_candidate is not None,
820
            step_proximity_score=proximity_score,
821
            step_alignment_score=alignment_score,
822
        )
823

824
        # Calculate combined bbox for the candidate (without diagram)
825
        combined_bbox = step_bbox
1✔
826
        if parts_list_bbox:
1✔
827
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)
1✔
828

829
        # Create candidate
830
        return Candidate(
1✔
831
            bbox=combined_bbox,
832
            label="step",
833
            score=score.overall_score(),
834
            score_details=score,
835
            source_blocks=[],
836
        )
837

838
    def _score_step_diagram_pair(
1✔
839
        self,
840
        step_bbox: BBox,
841
        diagram_bbox: BBox,
842
    ) -> float:
843
        """Score how well a diagram matches a step.
844

845
        Diagrams are typically positioned to the right of and/or below the step
846
        number. This method scores based on:
847
        - Horizontal position: prefer diagrams to the right, penalize left
848
        - Vertical position: prefer diagrams below the step header
849
        - Distance: closer is better
850

851
        Args:
852
            step_bbox: The step number bounding box
853
            diagram_bbox: The diagram bounding box to score
854

855
        Returns:
856
            Score between 0.0 and 1.0 (higher is better match)
857
        """
858
        # Reference point: bottom-right of step number
859
        ref_x = step_bbox.x1
×
860
        ref_y = step_bbox.y1
×
861

862
        # TODO Move all these constants into config, or make them adaptive?
863

864
        # Horizontal score
865
        # Diagrams to the right are preferred, but allow some overlap
UNCOV
866
        x_offset = diagram_bbox.x0 - ref_x
×
867

UNCOV
868
        if x_offset >= -50:
×
869
            # Diagram starts to the right or slightly overlapping - good
870
            # Score decreases slightly with distance to the right
871
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
×
UNCOV
872
        elif x_offset >= -200:
×
873
            # Diagram is moderately to the left - acceptable
874
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
×
875
        else:
876
            # Diagram is far to the left - poor match
877
            x_score = max(0.1, 0.3 + x_offset / 400.0)
×
878

879
        # Vertical score
880
        # Diagrams below the step header are preferred
UNCOV
881
        y_offset = diagram_bbox.y0 - ref_y
×
882

883
        if y_offset >= -30:
×
884
            # Diagram starts below or slightly overlapping - good
885
            # Score decreases with vertical distance
UNCOV
886
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
×
UNCOV
887
        elif y_offset >= -100:
×
888
            # Diagram is moderately above - less good but acceptable
UNCOV
889
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
×
890
        else:
891
            # Diagram is far above - poor match
UNCOV
892
            y_score = max(0.05, 0.2 + y_offset / 300.0)
×
893

894
        # Combined score - weight both dimensions equally
UNCOV
895
        score = 0.5 * x_score + 0.5 * y_score
×
896

UNCOV
897
        return score
×
898

899
    def _get_rotation_symbol_for_step(
1✔
900
        self,
901
        step_bbox: BBox,
902
        result: ClassificationResult,
903
    ) -> RotationSymbol | None:
904
        """Find an already-built rotation symbol within this step's area.
905

906
        Rotation symbols are built by PageClassifier before steps are processed.
907
        This method finds the already-built rotation symbol that falls within
908
        the step's bounding box.
909

910
        Args:
911
            step_bbox: The step's bounding box (including step_num, parts_list,
912
                and diagram)
913
            result: Classification result containing rotation symbol candidates
914

915
        Returns:
916
            Single RotationSymbol element for this step, or None if not found
917
        """
918
        # Get rotation_symbol candidates - they should already be built
919
        # by PageClassifier. Use valid_only=True to only get successfully
920
        # constructed rotation symbols.
921
        rotation_symbol_candidates = result.get_scored_candidates(
×
922
            "rotation_symbol", valid_only=True
923
        )
924

UNCOV
925
        log.debug(
×
926
            "[step] Looking for rotation symbols in step bbox %s, "
927
            "found %d built candidates",
928
            step_bbox,
929
            len(rotation_symbol_candidates),
930
        )
931

932
        if not rotation_symbol_candidates:
×
UNCOV
933
            return None
×
934

935
        # Find best-scoring rotation symbol overlapping the step's bbox
UNCOV
936
        overlapping_candidates = filter_overlapping(
×
937
            rotation_symbol_candidates, step_bbox
938
        )
939
        best_candidate = find_best_scoring(overlapping_candidates)
×
940

UNCOV
941
        if best_candidate and best_candidate.constructed is not None:
×
UNCOV
942
            rotation_symbol = best_candidate.constructed
×
UNCOV
943
            assert isinstance(rotation_symbol, RotationSymbol)
×
UNCOV
944
            log.debug(
×
945
                "[step] Found rotation symbol at %s (score=%.2f)",
946
                rotation_symbol.bbox,
947
                best_candidate.score,
948
            )
UNCOV
949
            return rotation_symbol
×
950

UNCOV
951
        log.debug("[step] No rotation symbol found in step bbox %s", step_bbox)
×
UNCOV
952
        return None
×
953

954
    def _get_arrows_for_step(
1✔
955
        self,
956
        step_num: StepNumber,
957
        diagram: Diagram | None,
958
        subassemblies: list[SubAssembly],
959
        result: ClassificationResult,
960
    ) -> list[Arrow]:
961
        """Collect already-built arrows that belong to this step.
962

963
        Arrows are built in Phase 3 of build_all(), before subassemblies.
964
        This method finds arrows that are positioned near the step's diagram
965
        and assigns them to this step.
966

967
        Typically these are arrows pointing from subassembly callout boxes
968
        to the main diagram. The search region includes both the diagram and
969
        any subassemblies assigned to this step.
970

971
        Args:
972
            step_num: The step number element
973
            diagram: The diagram element (if any)
974
            subassemblies: List of subassemblies assigned to this step
975
            result: Classification result containing arrow candidates
976

977
        Returns:
978
            List of Arrow elements for this step
979
        """
980
        # Get already-built arrows (built in Phase 3)
981
        arrow_candidates = result.get_scored_candidates(
1✔
982
            "arrow",
983
            valid_only=True,  # Only get successfully built arrows
984
        )
985

986
        log.debug(
1✔
987
            "[step] Looking for arrows for step %d, found %d built arrows",
988
            step_num.value,
989
            len(arrow_candidates),
990
        )
991

992
        if not arrow_candidates:
1✔
993
            return []
1✔
994

995
        # Build search region from diagram and subassemblies
996
        # Arrows typically connect subassemblies to the main diagram, so we need
997
        # to search both areas
998
        search_bboxes: list[BBox] = []
1✔
999
        if diagram:
1✔
1000
            search_bboxes.append(diagram.bbox)
1✔
1001
        for sa in subassemblies:
1✔
1002
            search_bboxes.append(sa.bbox)
1✔
1003

1004
        # Fallback to step number bbox if no diagram or subassemblies
1005
        if not search_bboxes:
1✔
UNCOV
1006
            search_bboxes.append(step_num.bbox)
×
1007

1008
        # Compute union of all components and expand
1009
        search_bbox = BBox.union_all(search_bboxes)
1✔
1010

1011
        # Expand search region to catch arrows near the components
1012
        # Use a larger margin since arrows can extend further
1013
        search_region = search_bbox.expand(100.0)
1✔
1014

1015
        log.debug(
1✔
1016
            "[step] Arrow search region for step %d: %s",
1017
            step_num.value,
1018
            search_region,
1019
        )
1020

1021
        # Find arrows within or overlapping the search region
1022
        arrows: list[Arrow] = []
1✔
1023
        overlapping_candidates = filter_overlapping(arrow_candidates, search_region)
1✔
1024

1025
        for candidate in overlapping_candidates:
1✔
1026
            assert candidate.constructed is not None
1✔
1027
            assert isinstance(candidate.constructed, Arrow)
1✔
1028
            log.debug(
1✔
1029
                "[step]   Arrow at %s overlaps search region (score=%.2f)",
1030
                candidate.bbox,
1031
                candidate.score,
1032
            )
1033
            arrows.append(candidate.constructed)
1✔
1034

1035
        log.debug(
1✔
1036
            "[step] Found %d arrows for step %d",
1037
            len(arrows),
1038
            step_num.value,
1039
        )
1040
        return arrows
1✔
1041

1042
    def _get_subassemblies_for_step(
1✔
1043
        self,
1044
        step_num: StepNumber,
1045
        diagram: Diagram | None,
1046
        result: ClassificationResult,
1047
    ) -> list[SubAssembly]:
1048
        """Get already-built subassemblies that belong to this step.
1049

1050
        Subassemblies are built in build_all() before steps. This method finds
1051
        subassemblies that are positioned near this step's diagram and haven't
1052
        been claimed by another step yet.
1053

1054
        Args:
1055
            step_num: The step number element
1056
            diagram: The diagram element (if any)
1057
            result: Classification result containing subassembly candidates
1058

1059
        Returns:
1060
            List of SubAssembly elements for this step
1061
        """
1062
        # Get all built subassemblies
UNCOV
1063
        all_subassemblies: list[SubAssembly] = []
×
UNCOV
1064
        for sa_candidate in result.get_scored_candidates(
×
1065
            "subassembly", valid_only=True
1066
        ):
UNCOV
1067
            assert sa_candidate.constructed is not None
×
1068
            assert isinstance(sa_candidate.constructed, SubAssembly)
×
UNCOV
1069
            all_subassemblies.append(sa_candidate.constructed)
×
1070

UNCOV
1071
        log.debug(
×
1072
            "[step] Looking for subassemblies for step %d, found %d built",
1073
            step_num.value,
1074
            len(all_subassemblies),
1075
        )
1076

1077
        if not all_subassemblies:
×
1078
            return []
×
1079

1080
        # Determine search region based on step_number and diagram
UNCOV
1081
        reference_bbox = diagram.bbox.union(step_num.bbox) if diagram else step_num.bbox
×
1082

UNCOV
1083
        page_bbox = result.page_data.bbox
×
1084

1085
        # Expand search region around the reference area
1086
        # Horizontally: search the full page width
1087
        # Vertically: search a region around the reference bbox
UNCOV
1088
        vertical_margin = reference_bbox.height
×
1089
        search_region = BBox(
×
1090
            x0=page_bbox.x0,
1091
            y0=max(page_bbox.y0, reference_bbox.y0 - vertical_margin),
1092
            x1=page_bbox.x1,
1093
            y1=min(page_bbox.y1, reference_bbox.y1 + vertical_margin),
1094
        )
1095

UNCOV
1096
        log.debug(
×
1097
            "[step] SubAssembly search region for step %d: %s",
1098
            step_num.value,
1099
            search_region,
1100
        )
1101

1102
        # Find subassemblies that overlap the search region
UNCOV
1103
        subassemblies: list[SubAssembly] = []
×
UNCOV
1104
        for subassembly in all_subassemblies:
×
UNCOV
1105
            if subassembly.bbox.overlaps(search_region):
×
UNCOV
1106
                log.debug(
×
1107
                    "[step]   SubAssembly at %s overlaps search region",
1108
                    subassembly.bbox,
1109
                )
UNCOV
1110
                subassemblies.append(subassembly)
×
1111

UNCOV
1112
        log.debug(
×
1113
            "[step] Found %d subassemblies for step %d",
1114
            len(subassemblies),
1115
            step_num.value,
1116
        )
UNCOV
1117
        return subassemblies
×
1118

1119
    def _compute_step_bbox(
1✔
1120
        self,
1121
        step_num: StepNumber,
1122
        parts_list: PartsList | None,
1123
        diagram: Diagram | None,
1124
        subassemblies: list[SubAssembly],
1125
        page_bbox: BBox,
1126
    ) -> BBox:
1127
        """Compute the overall bounding box for the Step.
1128

1129
        This encompasses the step number, parts list (if any), diagram (if any),
1130
        and all subassemblies.
1131
        The result is clipped to the page bounds to handle elements that extend
1132
        slightly off-page (e.g., arrows in diagrams).
1133

1134
        Args:
1135
            step_num: The step number element
1136
            parts_list: The parts list (if any)
1137
            diagram: The diagram element (if any)
1138
            subassemblies: List of subassemblies for this step
1139
            page_bbox: The page bounding box to clip to
1140

1141
        Returns:
1142
            Combined bounding box, clipped to page bounds
1143
        """
1144
        bboxes = [step_num.bbox]
1✔
1145
        if parts_list:
1✔
1146
            bboxes.append(parts_list.bbox)
1✔
1147
        if diagram:
1✔
1148
            bboxes.append(diagram.bbox)
1✔
1149
        for sa in subassemblies:
1✔
1150
            bboxes.append(sa.bbox)
1✔
1151

1152
        return BBox.union_all(bboxes).clip_to(page_bbox)
1✔
1153

1154

1155
def assign_diagrams_to_steps(
1✔
1156
    steps: list[Step],
1157
    diagrams: list[Diagram],
1158
    divider_bboxes: Sequence[BBox] = (),
1159
    max_distance: float = 500.0,
1160
) -> None:
1161
    """Assign diagrams to steps using Hungarian algorithm.
1162

1163
    Uses the shared pairing module to find optimal step number to diagram
1164
    pairings based on:
1165
    - Position: step number should be in top-left of diagram
1166
    - Distance: closer is better
1167
    - Dividers: pairings should not cross divider lines
1168

1169
    This function mutates the Step objects in place, setting their diagram
1170
    attribute.
1171

1172
    Args:
1173
        steps: List of Step objects to assign diagrams to
1174
        diagrams: List of Diagram objects to assign
1175
        divider_bboxes: Sequence of divider bounding boxes to check for crossing
1176
        max_distance: Maximum distance for a valid assignment.
1177
    """
1178
    if not steps or not diagrams:
1✔
1179
        log.debug(
1✔
1180
            "[step] No diagram assignment needed: steps=%d, diagrams=%d",
1181
            len(steps),
1182
            len(diagrams),
1183
        )
1184
        return
1✔
1185

1186
    log.debug(
1✔
1187
        "[step] Running Hungarian matching for diagrams: %d steps, %d diagrams",
1188
        len(steps),
1189
        len(diagrams),
1190
    )
1191

1192
    # Configure pairing
1193
    config = PairingConfig(
1✔
1194
        max_distance=max_distance,
1195
        position_weight=0.5,
1196
        distance_weight=0.5,
1197
        check_dividers=True,
1198
        top_left_tolerance=100.0,
1199
    )
1200

1201
    # Extract bboxes
1202
    step_bboxes = [s.step_number.bbox for s in steps]
1✔
1203
    diagram_bboxes = [d.bbox for d in diagrams]
1✔
1204

1205
    # Find optimal pairings using shared logic
1206
    pairings = find_optimal_pairings(
1✔
1207
        step_bboxes, diagram_bboxes, config, divider_bboxes
1208
    )
1209

1210
    # Assign diagrams to steps based on the matching
1211
    for pairing in pairings:
1✔
1212
        step = steps[pairing.step_index]
1✔
1213
        diag = diagrams[pairing.diagram_index]
1✔
1214
        # Use object.__setattr__ because Step is a frozen Pydantic model
1215
        object.__setattr__(step, "diagram", diag)
1✔
1216
        log.debug(
1✔
1217
            "[step] Assigned diagram at %s to step %d (cost=%.2f)",
1218
            diag.bbox,
1219
            step.step_number.value,
1220
            pairing.cost,
1221
        )
1222

1223

1224
def assign_subassemblies_to_steps(
1✔
1225
    steps: list[Step],
1226
    subassemblies: list[SubAssembly],
1227
    divider_bboxes: Sequence[BBox] = (),
1228
    max_distance: float = 400.0,
1229
) -> None:
1230
    """Assign subassemblies to steps using Hungarian algorithm.
1231

1232
    Uses optimal bipartite matching to pair subassemblies with steps based on:
1233
    - Distance from step's diagram to subassembly (closer is better)
1234
    - No divider between the subassembly and the step's diagram
1235

1236
    This function mutates the Step objects in place, adding to their
1237
    subassemblies list.
1238

1239
    Args:
1240
        steps: List of Step objects to assign subassemblies to
1241
        subassemblies: List of SubAssembly objects to assign
1242
        divider_bboxes: Sequence of divider bounding boxes for obstruction checking
1243
        max_distance: Maximum distance for a valid assignment
1244
    """
1245
    if not steps or not subassemblies:
1✔
1246
        log.debug(
1✔
1247
            "[step] No subassembly assignment needed: steps=%d, subassemblies=%d",
1248
            len(steps),
1249
            len(subassemblies),
1250
        )
1251
        return
1✔
1252

1253
    n_steps = len(steps)
1✔
1254
    n_subassemblies = len(subassemblies)
1✔
1255

1256
    log.debug(
1✔
1257
        "[step] Running Hungarian matching for subassemblies: "
1258
        "%d steps, %d subassemblies",
1259
        n_steps,
1260
        n_subassemblies,
1261
    )
1262

1263
    # Build cost matrix: rows = subassemblies, cols = steps
1264
    cost_matrix = np.zeros((n_subassemblies, n_steps))
1✔
1265
    high_cost = max_distance * 10
1✔
1266

1267
    for i, sa in enumerate(subassemblies):
1✔
1268
        sa_center = sa.bbox.center
1✔
1269
        for j, step in enumerate(steps):
1✔
1270
            # Use diagram center if available, otherwise step bbox center
1271
            if step.diagram:
1✔
1272
                target_bbox = step.diagram.bbox
1✔
1273
                target_center = target_bbox.center
1✔
1274
            else:
UNCOV
1275
                target_bbox = step.bbox
×
UNCOV
1276
                target_center = step.step_number.bbox.center
×
1277

1278
            # Base cost is distance from diagram/step to subassembly center
1279
            distance = (
1✔
1280
                (target_center[0] - sa_center[0]) ** 2
1281
                + (target_center[1] - sa_center[1]) ** 2
1282
            ) ** 0.5
1283

1284
            # Check for divider between subassembly and step's diagram
1285
            if has_divider_between(sa.bbox, target_bbox, divider_bboxes):
1✔
1286
                # High cost if there's a divider between
1287
                distance = high_cost
1✔
1288
                log.debug(
1✔
1289
                    "[step]   Divider between subassembly[%d] at %s and step[%d]",
1290
                    i,
1291
                    sa.bbox,
1292
                    step.step_number.value,
1293
                )
1294

1295
            cost_matrix[i, j] = distance
1✔
1296
            log.debug(
1✔
1297
                "[step]   Cost subassembly[%d] at %s to step[%d]: %.1f",
1298
                i,
1299
                sa.bbox,
1300
                step.step_number.value,
1301
                distance,
1302
            )
1303

1304
    # Apply max_distance threshold
1305
    cost_matrix_thresholded = np.where(
1✔
1306
        cost_matrix > max_distance, high_cost, cost_matrix
1307
    )
1308

1309
    # Run Hungarian algorithm
1310
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1311

1312
    # Assign subassemblies to steps based on the matching
1313
    # Note: Unlike diagrams, multiple subassemblies can be assigned to one step
1314
    # The Hungarian algorithm gives us the best 1:1 matching, but we may want
1315
    # to assign additional nearby subassemblies too
1316

1317
    # First, do the 1:1 optimal assignment
1318
    assigned_subassemblies: set[int] = set()
1✔
1319
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1320
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1321
            sa = subassemblies[row_idx]
1✔
1322
            step = steps[col_idx]
1✔
1323
            # Add to step's subassemblies list
1324
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1325
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1326
            assigned_subassemblies.add(row_idx)
1✔
1327
            log.debug(
1✔
1328
                "[step] Assigned subassembly at %s to step %d (cost=%.1f)",
1329
                sa.bbox,
1330
                step.step_number.value,
1331
                cost_matrix[row_idx, col_idx],
1332
            )
1333

1334
    # Then, try to assign remaining unassigned subassemblies to their nearest step
1335
    # (if within max_distance and no divider between)
1336
    for i, sa in enumerate(subassemblies):
1✔
1337
        if i in assigned_subassemblies:
1✔
1338
            continue
1✔
1339

1340
        # Find the step with lowest cost for this subassembly
1341
        best_step_idx = None
1✔
1342
        best_cost = high_cost
1✔
1343
        for j in range(len(steps)):
1✔
1344
            if cost_matrix[i, j] < best_cost:
1✔
1345
                best_cost = cost_matrix[i, j]
1✔
1346
                best_step_idx = j
1✔
1347

1348
        if best_step_idx is not None and best_cost <= max_distance:
1✔
1349
            step = steps[best_step_idx]
1✔
1350
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1351
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1352
            log.debug(
1✔
1353
                "[step] Assigned extra subassembly at %s to step %d (cost=%.1f)",
1354
                sa.bbox,
1355
                step.step_number.value,
1356
                best_cost,
1357
            )
1358

1359

1360
def assign_rotation_symbols_to_steps(
1✔
1361
    steps: list[Step],
1362
    rotation_symbols: list[RotationSymbol],
1363
    max_distance: float = 300.0,
1364
) -> None:
1365
    """Assign rotation symbols to steps using Hungarian algorithm.
1366

1367
    Uses optimal bipartite matching to pair rotation symbols with steps based on
1368
    minimum distance from the rotation symbol to the step's diagram (or step bbox
1369
    center if no diagram).
1370

1371
    This function mutates the Step objects in place, setting their rotation_symbol
1372
    attribute.
1373

1374
    Args:
1375
        steps: List of Step objects to assign rotation symbols to
1376
        rotation_symbols: List of RotationSymbol objects to assign
1377
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1378
            greater than this will not be matched.
1379
    """
1380
    if not steps or not rotation_symbols:
1✔
1381
        log.debug(
1✔
1382
            "[step] No rotation symbol assignment needed: "
1383
            "steps=%d, rotation_symbols=%d",
1384
            len(steps),
1385
            len(rotation_symbols),
1386
        )
1387
        return
1✔
1388

1389
    n_steps = len(steps)
1✔
1390
    n_symbols = len(rotation_symbols)
1✔
1391

1392
    log.debug(
1✔
1393
        "[step] Running Hungarian matching: %d steps, %d rotation symbols",
1394
        n_steps,
1395
        n_symbols,
1396
    )
1397

1398
    # Build cost matrix: rows = rotation symbols, cols = steps
1399
    # Cost = distance from rotation symbol center to step's diagram center
1400
    # (or step bbox center if no diagram)
1401
    cost_matrix = np.zeros((n_symbols, n_steps))
1✔
1402

1403
    for i, rs in enumerate(rotation_symbols):
1✔
1404
        rs_center = rs.bbox.center
1✔
1405
        for j, step in enumerate(steps):
1✔
1406
            # Use diagram center if available, otherwise step bbox center
1407
            if step.diagram:
1✔
1408
                target_center = step.diagram.bbox.center
1✔
1409
            else:
UNCOV
1410
                target_center = step.bbox.center
×
1411

1412
            # Euclidean distance
1413
            distance = (
1✔
1414
                (rs_center[0] - target_center[0]) ** 2
1415
                + (rs_center[1] - target_center[1]) ** 2
1416
            ) ** 0.5
1417

1418
            cost_matrix[i, j] = distance
1✔
1419
            log.debug(
1✔
1420
                "[step]   Distance from rotation_symbol[%d] at %s to step[%d]: %.1f",
1421
                i,
1422
                rs.bbox,
1423
                step.step_number.value,
1424
                distance,
1425
            )
1426

1427
    # Apply max_distance threshold by setting costs above threshold to a high value
1428
    # This prevents assignments that are too far apart
1429
    high_cost = max_distance * 10  # Arbitrary large value
1✔
1430
    cost_matrix_thresholded = np.where(
1✔
1431
        cost_matrix > max_distance, high_cost, cost_matrix
1432
    )
1433

1434
    # Run Hungarian algorithm
1435
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1436

1437
    # Assign rotation symbols to steps based on the matching
1438
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1439
        # Check if this assignment is within the max_distance threshold
1440
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1441
            rs = rotation_symbols[row_idx]
1✔
1442
            step = steps[col_idx]
1✔
1443
            step.rotation_symbol = rs
1✔
1444
            # Expand step bbox to include the rotation symbol
1445
            step.bbox = step.bbox.union(rs.bbox)
1✔
1446
            log.debug(
1✔
1447
                "[step] Assigned rotation symbol at %s to step %d "
1448
                "(distance=%.1f, new step bbox=%s)",
1449
                rs.bbox,
1450
                step.step_number.value,
1451
                cost_matrix[row_idx, col_idx],
1452
                step.bbox,
1453
            )
1454
        else:
UNCOV
1455
            log.debug(
×
1456
                "[step] Skipped assignment: rotation_symbol[%d] to step[%d] "
1457
                "distance=%.1f > max_distance=%.1f",
1458
                row_idx,
1459
                col_indices[row_idx] if row_idx < len(col_indices) else -1,
1460
                cost_matrix[row_idx, col_idx],
1461
                max_distance,
1462
            )
1463

1464

1465
def filter_subassembly_values[T](
1✔
1466
    items: list[tuple[int, T]],
1467
    *,
1468
    min_gap: int = 3,
1469
    max_subassembly_start: int = 3,
1470
) -> list[tuple[int, T]]:
1471
    """Filter items to exclude likely subassembly values.
1472

1473
    Subassembly steps (e.g., step 1, 2 inside a subassembly box) often appear
1474
    alongside higher-numbered page-level steps (e.g., 15, 16). This creates
1475
    sequences like [1, 2, 15, 16] which cannot all be page-level steps.
1476

1477
    This function identifies such cases by detecting a significant gap in values
1478
    and filtering out the lower-numbered items that are likely subassembly steps.
1479

1480
    Args:
1481
        items: List of (value, item) tuples where value is the step number
1482
            and item is any associated data (e.g., a Candidate).
1483
        min_gap: Minimum gap size to trigger filtering (exclusive).
1484
            Default is 3, so gaps > 3 trigger filtering.
1485
        max_subassembly_start: Maximum starting value for subassembly detection.
1486
            If min value is <= this, filtering is applied. Default is 3.
1487

1488
    Returns:
1489
        Filtered list of (value, item) tuples, keeping only the higher values
1490
        if a subassembly pattern is detected. Returns original list if no
1491
        filtering is needed.
1492

1493
    Examples:
1494
        >>> filter_subassembly_values([(1, "a"), (2, "b"), (15, "c"), (16, "d")])
1495
        [(15, "c"), (16, "d")]
1496

1497
        >>> filter_subassembly_values([(5, "a"), (6, "b"), (15, "c")])
1498
        [(5, "a"), (6, "b"), (15, "c")]  # min=5 > 3, no filtering
1499
    """
1500
    if len(items) <= 1:
1✔
1501
        return items
1✔
1502

1503
    # Sort by value
1504
    sorted_items = sorted(items, key=lambda x: x[0])
1✔
1505
    values = [v for v, _ in sorted_items]
1✔
1506

1507
    # Find the largest gap between consecutive values
1508
    max_gap_size = 0
1✔
1509
    max_gap_index = -1
1✔
1510
    for i in range(len(values) - 1):
1✔
1511
        gap = values[i + 1] - values[i]
1✔
1512
        if gap > max_gap_size:
1✔
1513
            max_gap_size = gap
1✔
1514
            max_gap_index = i
1✔
1515

1516
    # Check if filtering should occur:
1517
    # 1. Gap must be larger than min_gap
1518
    # 2. The minimum value must be <= max_subassembly_start
1519
    if max_gap_size > min_gap and max_gap_index >= 0:
1✔
1520
        min_value = values[0]
1✔
1521
        if min_value <= max_subassembly_start:
1✔
1522
            threshold = values[max_gap_index + 1]
1✔
1523
            return [(v, item) for v, item in sorted_items if v >= threshold]
1✔
1524

1525
    return items
1✔
1526

1527

1528
def filter_valid_substep_sequence[T](
1✔
1529
    items: list[T],
1530
    get_value: Callable[[T], int],
1531
) -> list[T]:
1532
    """Filter substeps to only include valid sequential values starting from 1.
1533

1534
    Substeps within a subassembly or page must form a valid sequence:
1535
    - Must start from 1
1536
    - Values must be consecutive (1, 2, 3, ... not 1, 3, 5)
1537

1538
    This filters out isolated numbers that are likely PieceLengths or other
1539
    misidentified elements.
1540

1541
    Args:
1542
        items: List of substep items to filter
1543
        get_value: Function to extract the step number value from an item
1544

1545
    Returns:
1546
        Filtered list containing only items that form a valid sequence from 1.
1547
        Returns empty list if no valid sequence starting from 1 is found.
1548

1549
    Examples:
1550
        >>> filter_valid_substep_sequence([1, 2, 3], lambda x: x)
1551
        [1, 2, 3]
1552

1553
        >>> filter_valid_substep_sequence([3, 5], lambda x: x)
1554
        []  # No item with value 1
1555

1556
        >>> filter_valid_substep_sequence([1, 2, 5], lambda x: x)
1557
        [1, 2]  # 5 breaks the sequence
1558
    """
1559
    if not items:
1✔
UNCOV
1560
        return []
×
1561

1562
    # Build a map from value to items
1563
    value_to_items: dict[int, list[T]] = {}
1✔
1564
    for item in items:
1✔
1565
        value = get_value(item)
1✔
1566
        if value not in value_to_items:
1✔
1567
            value_to_items[value] = []
1✔
1568
        value_to_items[value].append(item)
1✔
1569

1570
    # Must have step 1 to be a valid sequence
1571
    if 1 not in value_to_items:
1✔
1572
        log.debug("[substep_filter] No substep with value 1 found - rejecting all")
1✔
1573
        return []
1✔
1574

1575
    # Collect items in sequence starting from 1
1576
    result: list[T] = []
1✔
1577
    expected_value = 1
1✔
1578

1579
    while expected_value in value_to_items:
1✔
1580
        result.extend(value_to_items[expected_value])
1✔
1581
        expected_value += 1
1✔
1582

1583
    log.debug(
1✔
1584
        "[substep_filter] Filtered %d items to %d valid sequential substeps (1-%d)",
1585
        len(items),
1586
        len(result),
1587
        expected_value - 1,
1588
    )
1589

1590
    return result
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc