• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20472199734

23 Dec 2025 09:39PM UTC coverage: 88.693% (+0.2%) from 88.542%
20472199734

push

github

bramp
Add no-orphan constraints for Step child elements

- Add no-orphan constraints to StepClassifier.declare_constraints() for:
  - arrows (point from subassembly callouts to main diagram)
  - rotation_symbols (indicate model rotation)
  - subassemblies (callout boxes within steps)
  - substeps (mini-steps within a main step)
  - diagrams (the main instruction graphic)

- If any of these elements are selected, at least one step must also be
  selected, preventing orphaned elements

- Add unit tests for no-orphan constraint declaration

- Update architecture docs with no-orphan constraint documentation

- Add TODO for potential future centralization in SchemaConstraintGenerator

66 of 67 new or added lines in 2 files covered. (98.51%)

151 existing lines in 8 files now uncovered.

14786 of 16671 relevant lines covered (88.69%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.62
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from collections.abc import Callable, Sequence
1✔
25

26
import numpy as np
1✔
27
from scipy.optimize import linear_sum_assignment
1✔
28

29
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
30
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
31
    CandidateFailedError,
32
    ClassificationResult,
33
)
34
from build_a_long.pdf_extract.classifier.constraint_model import ConstraintModel
1✔
35
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
36
    LabelClassifier,
37
)
38
from build_a_long.pdf_extract.classifier.rule_based_classifier import (
1✔
39
    StepNumberScore,
40
)
41
from build_a_long.pdf_extract.classifier.score import (
1✔
42
    Score,
43
    Weight,
44
    find_best_scoring,
45
)
46
from build_a_long.pdf_extract.classifier.steps.pairing import (
1✔
47
    PairingConfig,
48
    find_optimal_pairings,
49
    has_divider_between,
50
)
51
from build_a_long.pdf_extract.classifier.steps.substep_classifier import _SubStepScore
1✔
52
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
53
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
54
    Arrow,
55
    Diagram,
56
    Divider,
57
    LegoPageElements,
58
    PartsList,
59
    RotationSymbol,
60
    Step,
61
    StepNumber,
62
    SubAssembly,
63
    SubStep,
64
)
65

66
log = logging.getLogger(__name__)
1✔
67

68

69
class _StepScore(Score):
1✔
70
    """Internal score representation for step classification."""
71

72
    step_value: int
1✔
73
    """The parsed step number value (e.g., 1, 2, 3)."""
1✔
74

75
    step_number_candidate: Candidate
1✔
76
    """The step number candidate this step is associated with."""
1✔
77

78
    parts_list_candidate: Candidate | None
1✔
79
    """The parts list candidate paired with this step (if any)."""
1✔
80

81
    has_parts_list: bool
1✔
82
    """Whether this step has an associated parts list."""
1✔
83

84
    step_proximity_score: float
1✔
85
    """Score based on proximity to the PartsList above (0.0-1.0).
1✔
86
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
87

88
    step_alignment_score: float
1✔
89
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
1✔
90
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
91

92
    def score(self) -> Weight:
1✔
93
        """Return the overall pairing score."""
UNCOV
94
        return self.overall_score()
×
95

96
    def overall_score(self) -> float:
1✔
97
        """Calculate overall quality score based on parts list pairing.
98

99
        Steps with a parts_list are given a bonus to prefer them over
100
        steps without parts_list. Diagrams are found at build time, not
101
        during scoring, to allow rotation symbols to claim small images first.
102
        """
103
        if self.has_parts_list:
1✔
104
            # Base score for having parts_list + proximity/alignment bonus
105
            parts_list_bonus = 0.5
1✔
106
            pairing_score = (
1✔
107
                self.step_proximity_score + self.step_alignment_score
108
            ) / 2.0
109
            return parts_list_bonus + 0.5 * pairing_score
1✔
110
        return 0.3  # Lower base score for steps without parts list
1✔
111

112
    def sort_key(self) -> tuple[float, int]:
1✔
113
        """Return a tuple for sorting candidates.
114

115
        We prefer:
116
        1. Higher overall scores (better StepNumber-PartsList-Diagram match)
117
        2. Lower step number values (to break ties and maintain order)
118
        """
119
        return (-self.overall_score(), self.step_value)
1✔
120

121

122
class StepClassifier(LabelClassifier):
1✔
123
    """Classifier for complete Step structures."""
124

125
    output = "step"
1✔
126
    requires = frozenset(
1✔
127
        {
128
            "arrow",
129
            "step_number",
130
            "parts_list",
131
            "diagram",
132
            "rotation_symbol",
133
            "subassembly",
134
            "substep",
135
            "preview",
136
        }
137
    )
138

139
    def declare_constraints(
1✔
140
        self, model: ConstraintModel, result: ClassificationResult
141
    ) -> None:
142
        """Declare constraints for step candidates.
143

144
        Constraints:
145
        1. Uniqueness by step_value: At most one step per step_value (e.g., only
146
           one "Step 1"). The solver picks the highest-scoring candidate.
147

148
        2. No orphaned elements: If any of these elements are selected, at least
149
           one step must also be selected:
150
           - arrows (point from subassembly callouts to main diagram)
151
           - rotation_symbols (indicate model rotation)
152
           - subassemblies (callout boxes within steps)
153
           - substeps (mini-steps within a main step)
154
           - diagrams (the main instruction graphic)
155

156
        Note: Child uniqueness constraints (each step_number/parts_list can only
157
        be used by one parent) are handled automatically by
158
        SchemaConstraintGenerator.add_child_uniqueness_constraints().
159
        """
160
        candidates = result.get_candidates(self.output)
1✔
161
        candidates_in_model = [c for c in candidates if model.has_candidate(c)]
1✔
162

163
        # === Constraint 1: Uniqueness by step_value ===
164
        if candidates_in_model:
1✔
165
            by_value: dict[int, list[Candidate]] = {}
1✔
166

167
            for cand in candidates_in_model:
1✔
168
                if isinstance(cand.score_details, _StepScore):
1✔
169
                    score = cand.score_details
1✔
170
                    step_value = score.step_value
1✔
171
                    if step_value not in by_value:
1✔
172
                        by_value[step_value] = []
1✔
173
                    by_value[step_value].append(cand)
1✔
174

175
            for step_value, cands in by_value.items():
1✔
176
                if len(cands) > 1:
1✔
177
                    model.at_most_one_of(cands)
1✔
178
                    log.debug(
1✔
179
                        "[step] Uniqueness constraint: at most one step with "
180
                        "value=%d (%d candidates)",
181
                        step_value,
182
                        len(cands),
183
                    )
184

185
        # === Constraint 2: No orphaned elements ===
186
        # These elements must have a parent step if they are selected.
187
        # This prevents arrows, diagrams, etc. from being built without a step.
188
        #
189
        # TODO: Consider moving this to SchemaConstraintGenerator using
190
        # __constraint_rules__['field']['no_orphans'] = True on the Step schema.
191
        # This would centralize the constraint logic and make it declarative.
192
        # However, not all child elements should have no-orphan constraints
193
        # (e.g., diagrams can exist standalone on cover pages).
194
        orphan_labels = [
1✔
195
            "arrow",
196
            "rotation_symbol",
197
            "subassembly",
198
            "substep",
199
            "diagram",
200
        ]
201

202
        for label in orphan_labels:
1✔
203
            child_candidates = [
1✔
204
                c for c in result.get_candidates(label) if model.has_candidate(c)
205
            ]
206
            if child_candidates and candidates_in_model:
1✔
207
                model.if_any_selected_then_one_of(child_candidates, candidates_in_model)
1✔
208
                log.debug(
1✔
209
                    "[step] No-orphan constraint: if any %s selected, "
210
                    "at least one step must be selected (%d %s, %d steps)",
211
                    label,
212
                    len(child_candidates),
213
                    label,
214
                    len(candidates_in_model),
215
                )
216

217
    def _score(self, result: ClassificationResult) -> None:
1✔
218
        """Score step pairings and create candidates."""
219
        page_data = result.page_data
1✔
220

221
        # Get step number and parts list candidates (not constructed elements)
222
        step_number_candidates = result.get_scored_candidates("step_number")
1✔
223

224
        if not step_number_candidates:
1✔
225
            return
1✔
226

227
        # Get parts_list candidates
228
        parts_list_candidates = result.get_scored_candidates("parts_list")
1✔
229

230
        log.debug(
1✔
231
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
232
            page_data.page_number,
233
            len(step_number_candidates),
234
            len(parts_list_candidates),
235
        )
236

237
        # Create all possible Step candidates for pairings (without diagrams initially)
238
        # Deduplication happens at build time, not scoring time, so that
239
        # diagram assignment can be re-evaluated as blocks get claimed.
240
        all_candidates: Sequence[Candidate] = []
1✔
241
        for step_candidate in step_number_candidates:
1✔
242
            # Create candidates for this StepNumber paired with each PartsList
243
            for parts_list_candidate in parts_list_candidates:
1✔
244
                candidate = self._create_step_candidate(
1✔
245
                    step_candidate, parts_list_candidate, result
246
                )
247
                if candidate:
1✔
248
                    all_candidates.append(candidate)
1✔
249

250
            # Also create a candidate with no PartsList (fallback)
251
            candidate = self._create_step_candidate(step_candidate, None, result)
1✔
252
            if candidate:
1✔
253
                all_candidates.append(candidate)
1✔
254

255
        # Add all candidates to result (deduplication happens at build time)
256
        for candidate in all_candidates:
1✔
257
            result.add_candidate(candidate)
1✔
258

259
        log.debug(
1✔
260
            "[step] Created %d step candidates",
261
            len(all_candidates),
262
        )
263

264
    def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
1✔
265
        """Build all Step elements with coordinated element assignment.
266

267
        Build Order (phases run sequentially):
268
        - Phase 1: Rotation symbols - claim small circular arrow graphics
269
        - Phase 2: Parts lists - claim part count blocks and part images
270
        - Phase 3: Arrows - claim arrowhead triangles and shaft lines/rectangles
271
        - Phase 4: Subassemblies & Previews - claim white callout boxes
272
        - Phase 5: Steps - build step_number + parts_list pairs (no diagram yet)
273
        - Phase 6: Diagrams - build remaining unclaimed diagram regions
274
        - Phase 7: Assign diagrams to steps via Hungarian matching
275
        - Phase 8: Assign subassemblies to steps via Hungarian matching
276
        - Phase 9: Finalize steps - add arrows, compute final bboxes
277
        - Phase 10: Assign rotation symbols to steps via Hungarian matching
278

279
        The order matters because elements compete for the same Drawing blocks.
280
        Earlier phases claim blocks first, preventing later phases from using them.
281
        For example:
282
        - Arrows must build before subassemblies so arrow shafts aren't consumed
283
          by subassembly white boxes
284
        - Rotation symbols must build before diagrams so small circular graphics
285
          aren't incorrectly clustered into diagrams
286

287
        Step uniqueness is enforced by the constraint solver when enabled
288
        (via declare_constraints). When solver is disabled, deduplication
289
        falls back to greedy matching by step_value.
290

291
        Returns:
292
            List of successfully constructed Step elements, sorted by step number
293
        """
294
        # Pre-check: Get step candidates early to decide if we should build
295
        # supporting elements (rotation symbols, arrows, etc.)
296
        # On INFO pages with no steps, building these would create orphaned elements.
297
        all_step_candidates = result.get_scored_candidates("step")
1✔
298
        has_step_candidates = bool(all_step_candidates)
1✔
299

300
        # Phase 1: Build all rotation symbols BEFORE steps.
301
        # This allows rotation symbols to claim their Drawing blocks first,
302
        # preventing them from being incorrectly clustered into diagrams.
303
        #
304
        # IMPORTANT: Only build rotation symbols if there are step candidates.
305
        # On INFO pages, there are no steps to assign rotation symbols to.
306
        if has_step_candidates:
1✔
307
            for rs_candidate in result.get_scored_candidates("rotation_symbol"):
1✔
308
                try:
1✔
309
                    result.build(rs_candidate)
1✔
310
                    log.debug(
1✔
311
                        "[step] Built rotation symbol at %s (score=%.2f)",
312
                        rs_candidate.bbox,
313
                        rs_candidate.score,
314
                    )
UNCOV
315
                except Exception as e:
×
UNCOV
316
                    log.debug(
×
317
                        "[step] Failed to construct rotation_symbol "
318
                        "candidate at %s: %s",
319
                        rs_candidate.bbox,
320
                        e,
321
                    )
322
        else:
323
            log.debug("[step] Skipping rotation symbol build - no step candidates")
1✔
324

325
        # Phase 2: Build all parts lists BEFORE steps.
326
        # This allows parts lists to claim their Drawing blocks first,
327
        # preventing them from being claimed by subassemblies.
328
        for pl_candidate in result.get_scored_candidates("parts_list"):
1✔
329
            try:
1✔
330
                result.build(pl_candidate)
1✔
331
                log.debug(
1✔
332
                    "[step] Built parts_list at %s (score=%.2f)",
333
                    pl_candidate.bbox,
334
                    pl_candidate.score,
335
                )
UNCOV
336
            except Exception as e:
×
UNCOV
337
                log.debug(
×
338
                    "[step] Failed to construct parts_list candidate at %s: %s",
339
                    pl_candidate.bbox,
340
                    e,
341
                )
342

343
        # Phase 3: Build all arrows BEFORE subassemblies.
344
        # Arrows connect subassembly callout boxes to main diagrams. The arrow
345
        # shaft may be a thin rectangle that overlaps the subassembly's white
346
        # box. By building arrows first, they claim the shaft blocks before
347
        # subassemblies can consume them.
348
        #
349
        # IMPORTANT: Only build arrows if there are step candidates.
350
        # On INFO pages, there are no steps to assign arrows to, so building
351
        # them would create orphaned elements that trigger validation errors.
352
        # Note: has_step_candidates was computed in the pre-check.
353
        if has_step_candidates:
1✔
354
            for arrow_candidate in result.get_scored_candidates("arrow"):
1✔
355
                try:
1✔
356
                    result.build(arrow_candidate)
1✔
357
                    log.debug(
1✔
358
                        "[step] Built arrow at %s (score=%.2f)",
359
                        arrow_candidate.bbox,
360
                        arrow_candidate.score,
361
                    )
UNCOV
362
                except Exception as e:
×
UNCOV
363
                    log.debug(
×
364
                        "[step] Failed to construct arrow candidate at %s: %s",
365
                        arrow_candidate.bbox,
366
                        e,
367
                    )
368
        else:
369
            log.debug("[step] Skipping arrow build - no step candidates on page")
1✔
370

371
        # Phase 4: Build subassemblies and previews BEFORE steps.
372
        # Both subassemblies and previews are white boxes with diagrams inside.
373
        # We combine them and build in score order so the higher-scoring
374
        # candidate claims the white box first. When a candidate is built,
375
        # its source_blocks are marked as consumed, causing any competing
376
        # candidate using the same blocks to fail.
377
        #
378
        # This allows subassemblies (which have step_count labels like "2x")
379
        # to be distinguished from previews (which appear before steps).
380
        subassembly_candidates = result.get_scored_candidates("subassembly")
1✔
381
        preview_candidates = result.get_scored_candidates("preview")
1✔
382

383
        # Combine and sort by score (highest first)
384
        combined_candidates = list(subassembly_candidates) + list(preview_candidates)
1✔
385
        combined_candidates.sort(key=lambda c: c.score, reverse=True)
1✔
386

387
        for candidate in combined_candidates:
1✔
388
            try:
1✔
389
                result.build(candidate)
1✔
390
                log.debug(
1✔
391
                    "[step] Built %s at %s (score=%.2f)",
392
                    candidate.label,
393
                    candidate.bbox,
394
                    candidate.score,
395
                )
396
            except Exception as e:
1✔
397
                log.debug(
1✔
398
                    "[step] Failed to construct %s candidate at %s: %s",
399
                    candidate.label,
400
                    candidate.bbox,
401
                    e,
402
                )
403

404
        # Phase 5: Build Step candidates
405
        # The solver (via get_scored_candidates) returns only solver-selected
406
        # candidates (at most one per step_value), so no deduplication needed.
407
        # Filter out candidates that look like substeps (small step numbers with
408
        # a significant gap from main step numbers). These are handled by
409
        # SubStepClassifier instead.
410
        # Steps are built as "partial" - just step_number + parts_list.
411
        # Diagrams and subassemblies are assigned later via Hungarian matching.
412
        # Note: all_step_candidates was already fetched in pre-check phase.
413
        page_level_step_candidates = self._filter_page_level_step_candidates(
1✔
414
            all_step_candidates
415
        )
416

417
        steps: Sequence[Step] = []
1✔
418
        # Track built step values to detect duplicates (should never happen
419
        # since solver enforces uniqueness via at_most_one_of constraints)
420
        built_step_values: set[int] = set()
1✔
421

422
        for step_candidate in page_level_step_candidates:
1✔
423
            # Get step value from score
424
            score = step_candidate.score_details
1✔
425
            if not isinstance(score, _StepScore):
1✔
UNCOV
426
                continue
×
427

428
            # Solver enforces uniqueness - we should never see duplicates
429
            assert score.step_value not in built_step_values, (
1✔
430
                f"Duplicate step {score.step_value} candidate - solver should "
431
                f"have enforced uniqueness"
432
            )
433

434
            try:
1✔
435
                elem = result.build(step_candidate)
1✔
436
                assert isinstance(elem, Step)
1✔
437
                steps.append(elem)
1✔
438
                built_step_values.add(score.step_value)
1✔
439
                log.debug(
1✔
440
                    "[step] Built partial step %d (parts_list=%s, score=%.2f)",
441
                    score.step_value,
442
                    "yes" if score.parts_list_candidate is not None else "no",
443
                    step_candidate.score,
444
                )
445
            except Exception as e:
1✔
446
                log.debug(
1✔
447
                    "[step] Failed to construct step %d candidate: %s",
448
                    score.step_value,
449
                    e,
450
                )
451

452
        # Sort steps by their step_number value
453
        steps.sort(key=lambda step: step.step_number.value)
1✔
454

455
        # Phase 6: Build diagrams (not yet claimed by subassemblies)
456
        # Phase 7: Assign diagrams to steps using Hungarian matching
457
        # Collect available diagram candidates (not yet claimed by subassemblies)
458
        available_diagrams: list[Diagram] = []
1✔
459
        for diag_candidate in result.get_scored_candidates("diagram"):
1✔
460
            if result.get_constructed(diag_candidate) is None:
1✔
461
                # Build the diagram
462
                try:
1✔
463
                    diag_elem = result.build(diag_candidate)
1✔
464
                    assert isinstance(diag_elem, Diagram)
1✔
465
                    available_diagrams.append(diag_elem)
1✔
466
                except Exception as e:
1✔
467
                    log.debug(
1✔
468
                        "[step] Failed to build diagram at %s: %s",
469
                        diag_candidate.bbox,
470
                        e,
471
                    )
472
            elif isinstance(result.get_constructed(diag_candidate), Diagram):
1✔
473
                # Already built (claimed by subassembly) - skip
474
                pass
1✔
475

476
        log.debug(
1✔
477
            "[step] Available diagrams for step assignment: %d",
478
            len(available_diagrams),
479
        )
480

481
        # Collect divider bboxes for obstruction checking (used by multiple phases)
482
        divider_bboxes: list[BBox] = []
1✔
483
        for div_candidate in result.get_built_candidates("divider"):
1✔
484
            divider = result.get_constructed(div_candidate)
1✔
485
            assert divider is not None
1✔
486
            assert isinstance(divider, Divider)
1✔
487
            divider_bboxes.append(divider.bbox)
1✔
488

489
        # Assign diagrams to steps using Hungarian matching
490
        # Dividers prevent pairing across page divisions
491
        assign_diagrams_to_steps(steps, available_diagrams, divider_bboxes)
1✔
492

493
        # Phase 8: Assign subassemblies to steps using Hungarian matching
494
        # Collect built subassemblies
495
        subassemblies: Sequence[SubAssembly] = []
1✔
496
        for sa_candidate in result.get_built_candidates("subassembly"):
1✔
497
            subassembly = result.get_constructed(sa_candidate)
1✔
498
            assert subassembly is not None
1✔
499
            assert isinstance(subassembly, SubAssembly)
1✔
500
            subassemblies.append(subassembly)
1✔
501

502
        log.debug(
1✔
503
            "[step] Assigning %d subassemblies to %d steps (%d dividers for "
504
            "obstruction checking)",
505
            len(subassemblies),
506
            len(steps),
507
            len(divider_bboxes),
508
        )
509

510
        # Assign subassemblies to steps using Hungarian matching
511
        assign_subassemblies_to_steps(steps, subassemblies, divider_bboxes)
1✔
512

513
        # Phase 8b: Get unclaimed SubSteps as naked substeps
514
        # SubStepClassifier found small step number + diagram pairs.
515
        # SubAssemblyClassifier claimed those inside callout boxes.
516
        # The remaining ones become "naked" substeps for the main step.
517
        if steps:
1✔
518
            substeps = self._get_unclaimed_substeps(result)
1✔
519
            if substeps:
1✔
520
                # Assign all substeps to the single main step on the page
521
                # (In the future, we could use Hungarian matching if there are
522
                # multiple main steps, but for now this covers the common case)
523
                main_step = steps[0]  # Usually only one main step when substeps exist
1✔
524
                object.__setattr__(main_step, "substeps", substeps)
1✔
525
                log.debug(
1✔
526
                    "[step] Assigned %d substeps to step %d",
527
                    len(substeps),
528
                    main_step.step_number.value,
529
                )
530

531
        # Phase 9: Finalize steps - collect arrows and compute final bboxes
532
        page_bbox = result.page_data.bbox
1✔
533
        for step in steps:
1✔
534
            # Get arrows for this step (pass subassemblies for search region)
535
            arrows = self._get_arrows_for_step(
1✔
536
                step.step_number, step.diagram, step.subassemblies, result
537
            )
538

539
            # Compute final bbox including all components
540
            final_bbox = self._compute_step_bbox(
1✔
541
                step.step_number,
542
                step.parts_list,
543
                step.diagram,
544
                step.subassemblies,
545
                page_bbox,
546
            )
547

548
            # Update the step (Step is a Pydantic model, so we need to reassign)
549
            # We mutate in place by directly setting attributes
550
            object.__setattr__(step, "arrows", arrows)
1✔
551
            object.__setattr__(step, "bbox", final_bbox)
1✔
552

553
        # Phase 10: Assign rotation symbols to steps using Hungarian matching
554
        # Collect built rotation symbols
555
        rotation_symbols: list[RotationSymbol] = []
1✔
556
        for rs_candidate in result.get_built_candidates("rotation_symbol"):
1✔
557
            rs = result.get_constructed(rs_candidate)
1✔
558
            assert rs is not None
1✔
559
            assert isinstance(rs, RotationSymbol)
1✔
560
            rotation_symbols.append(rs)
1✔
561

562
        assign_rotation_symbols_to_steps(steps, rotation_symbols)
1✔
563

564
        log.debug(
1✔
565
            "[step] build_all complete: %d steps, %d rotation symbols assigned",
566
            len(steps),
567
            sum(1 for s in steps if s.rotation_symbol is not None),
568
        )
569

570
        # Cast for type compatibility with base class return type
571
        return list[LegoPageElements](steps)
1✔
572

573
    def _filter_page_level_step_candidates(
1✔
574
        self, candidates: Sequence[Candidate]
575
    ) -> Sequence[Candidate]:
576
        """Filter step candidates to exclude likely substep candidates.
577

578
        When a page has both low-numbered steps (1, 2, 3, 4) and high-numbered
579
        steps (338), with a significant gap between them, the low numbers are
580
        likely substeps rather than main steps. These are handled by
581
        SubStepClassifier instead.
582

583
        Args:
584
            candidates: All step candidates to filter
585

586
        Returns:
587
            Page-level step candidates (excluding substep candidates)
588
        """
589
        # Extract step number values from candidates
590
        items_with_values: list[tuple[int, Candidate]] = []
1✔
591
        for candidate in candidates:
1✔
592
            score = candidate.score_details
1✔
593
            if not isinstance(score, _StepScore):
1✔
UNCOV
594
                continue
×
595
            items_with_values.append((score.step_value, candidate))
1✔
596

597
        # Use generic filtering function to find page-level steps
598
        filtered_items = filter_subassembly_values(items_with_values)
1✔
599

600
        # If filtering occurred, return only page-level candidates
601
        if len(filtered_items) != len(items_with_values):
1✔
602
            page_level_values = {v for v, _ in filtered_items}
1✔
603
            page_level_candidates = [
1✔
604
                item for v, item in items_with_values if v in page_level_values
605
            ]
606
            log.debug(
1✔
607
                "[step] Filtered to page-level candidates: %s (excluded substeps: %s)",
608
                sorted(page_level_values),
609
                sorted(v for v, _ in items_with_values if v not in page_level_values),
610
            )
611
            return page_level_candidates
1✔
612

613
        # No filtering occurred, return all as page-level
614
        return candidates
1✔
615

616
    def _get_unclaimed_substeps(
1✔
617
        self,
618
        result: ClassificationResult,
619
    ) -> list[SubStep]:
620
        """Get SubStep elements that weren't claimed by a SubAssembly.
621

622
        SubStepClassifier creates candidates for all small step number +
623
        diagram pairs. SubAssemblyClassifier then claims those inside callout boxes.
624
        This method returns the remaining unclaimed SubSteps for use as
625
        "naked" substeps in a Step.
626

627
        The substeps are filtered to only include valid sequences starting from 1.
628
        This helps filter out PieceLengths or other misidentified elements.
629

630
        Args:
631
            result: Classification result
632

633
        Returns:
634
            List of SubStep elements that weren't claimed by any SubAssembly,
635
            sorted by step number value, and filtered to valid sequences from 1.
636
        """
637
        # Get all substep candidates
638
        substep_candidates = result.get_scored_candidates("substep")
1✔
639

640
        # Filter to only unclaimed candidates (not yet built)
641
        unclaimed_candidates = [
1✔
642
            c for c in substep_candidates if result.get_constructed(c) is None
643
        ]
644

645
        if not unclaimed_candidates:
1✔
646
            return []
1✔
647

648
        # Extract step values from candidates to filter BEFORE building
649
        def get_step_value(candidate: Candidate) -> int:
1✔
650
            """Get step value from a substep candidate's score."""
651
            score = candidate.score_details
1✔
652
            assert isinstance(score, _SubStepScore)
1✔
653

654
            return score.step_value
1✔
655

656
        # Filter candidates to valid sequential substeps (must start from 1)
657
        valid_candidates = filter_valid_substep_sequence(
1✔
658
            unclaimed_candidates, get_step_value
659
        )
660

661
        if not valid_candidates:
1✔
662
            log.debug("[step] No valid substep sequence found (no step 1)")
1✔
663
            return []
1✔
664

665
        # Now build only the filtered candidates
666
        substeps: Sequence[SubStep] = []
1✔
667
        for candidate in valid_candidates:
1✔
668
            try:
1✔
669
                substep_elem = result.build(candidate)
1✔
670
                assert isinstance(substep_elem, SubStep)
1✔
671
                substeps.append(substep_elem)
1✔
672
            except CandidateFailedError:
1✔
673
                # Already claimed - skip
674
                continue
1✔
675

676
        # Sort by step number value
677
        substeps.sort(key=lambda s: s.step_number.value)
1✔
678

679
        log.debug(
1✔
680
            "[step] Found %d unclaimed substeps (after sequence filtering)",
681
            len(substeps),
682
        )
683

684
        return substeps
1✔
685

686
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
1✔
687
        """Construct a partial Step element from a single candidate.
688

689
        This creates a Step with just step_number and parts_list. The diagram,
690
        subassemblies, and arrows are assigned later in build_all() using
691
        Hungarian matching for optimal global assignment.
692
        """
693
        score = candidate.score_details
1✔
694
        assert isinstance(score, _StepScore)
1✔
695

696
        # Validate and extract step number from parent candidate
697
        step_num_candidate = score.step_number_candidate
1✔
698

699
        step_num_elem = result.build(step_num_candidate)
1✔
700
        assert isinstance(step_num_elem, StepNumber)
1✔
701
        step_num = step_num_elem
1✔
702

703
        # Validate and extract parts list from parent candidate (if present)
704
        parts_list = None
1✔
705
        if score.parts_list_candidate:
1✔
706
            parts_list_candidate = score.parts_list_candidate
1✔
707
            parts_list_elem = result.build(parts_list_candidate)
1✔
708
            assert isinstance(parts_list_elem, PartsList)
1✔
709
            parts_list = parts_list_elem
1✔
710

711
        # Create partial step - diagram, subassemblies, arrows assigned later
712
        initial_bbox = step_num.bbox
1✔
713
        if parts_list:
1✔
714
            initial_bbox = initial_bbox.union(parts_list.bbox)
1✔
715

716
        return Step(
1✔
717
            bbox=initial_bbox,
718
            step_number=step_num,
719
            parts_list=parts_list,
720
            diagram=None,
721
            rotation_symbol=None,
722
            arrows=[],
723
            subassemblies=[],
724
        )
725

726
    def _find_and_build_diagram_for_step(
1✔
727
        self,
728
        step_num: StepNumber,
729
        parts_list: PartsList | None,
730
        result: ClassificationResult,
731
    ) -> Diagram | None:
732
        """Find and build the best diagram for this step.
733

734
        This is called at build time, after rotation symbols and subassemblies
735
        have been built (in build_all Phases 1 and 3), so they've already
736
        claimed any images/diagrams they need. We filter to only consider
737
        diagram candidates that haven't been constructed yet.
738

739
        Args:
740
            step_num: The built step number element
741
            parts_list: The built parts list element (if any)
742
            result: Classification result containing diagram candidates
743

744
        Returns:
745
            The built Diagram element, or None if no suitable diagram found
746
        """
747
        # Get all non-failed, non-constructed diagram candidates
UNCOV
748
        diagram_candidates = result.get_scored_candidates("diagram")
×
749

750
        # Filter to only candidates that haven't been built yet
751
        # (subassemblies and rotation symbols built earlier may have claimed some)
752
        available_candidates = [
×
753
            c for c in diagram_candidates if result.get_constructed(c) is None
754
        ]
755

UNCOV
756
        if not available_candidates:
×
UNCOV
757
            log.debug(
×
758
                "[step] No diagram candidates available for step %d",
759
                step_num.value,
760
            )
761
            return None
×
762

763
        # Score each candidate based on position relative to step
UNCOV
764
        step_bbox = step_num.bbox
×
UNCOV
765
        best_candidate = None
×
UNCOV
766
        best_score = -float("inf")
×
767

768
        for candidate in available_candidates:
×
UNCOV
769
            score = self._score_step_diagram_pair(step_bbox, candidate.bbox)
×
770

UNCOV
771
            log.debug(
×
772
                "[step] Diagram candidate at %s for step %d: score=%.2f",
773
                candidate.bbox,
774
                step_num.value,
775
                score,
776
            )
777

UNCOV
778
            if score > best_score:
×
UNCOV
779
                best_score = score
×
UNCOV
780
                best_candidate = candidate
×
781

UNCOV
782
        if best_candidate is None or best_score < 0.2:
×
UNCOV
783
            log.debug(
×
784
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
785
                step_num.value,
786
                best_score,
787
            )
UNCOV
788
            return None
×
789

790
        # Build the diagram
UNCOV
791
        try:
×
UNCOV
792
            diagram_elem = result.build(best_candidate)
×
UNCOV
793
            assert isinstance(diagram_elem, Diagram)
×
UNCOV
794
            log.debug(
×
795
                "[step] Built diagram at %s for step %d (score=%.2f)",
796
                diagram_elem.bbox,
797
                step_num.value,
798
                best_score,
799
            )
800
            return diagram_elem
×
UNCOV
801
        except CandidateFailedError as e:
×
UNCOV
802
            log.debug(
×
803
                "[step] Failed to build diagram for step %d: %s",
804
                step_num.value,
805
                e,
806
            )
UNCOV
807
            return None
×
808

809
    def _create_step_candidate(
1✔
810
        self,
811
        step_candidate: Candidate,
812
        parts_list_candidate: Candidate | None,
813
        result: ClassificationResult,
814
    ) -> Candidate | None:
815
        """Create a Step candidate (without diagram assignment).
816

817
        Diagrams are found at build time, not during scoring, to allow
818
        rotation symbols to claim small images first.
819

820
        Args:
821
            step_candidate: The StepNumber candidate for this step
822
            parts_list_candidate: The PartsList candidate to pair with (or None)
823
            result: Classification result
824

825
        Returns:
826
            The created Candidate with score but no construction, or None if
827
            the step number value cannot be extracted.
828
        """
829
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
1✔
830
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
1✔
831
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance
1✔
832

833
        # Extract step number value from the candidate's score
834
        score_details = step_candidate.score_details
1✔
835
        if not isinstance(score_details, StepNumberScore):
1✔
UNCOV
836
            return None
×
837
        step_value = score_details.step_value
1✔
838
        if step_value == 0:
1✔
UNCOV
839
            return None
×
840

841
        step_bbox = step_candidate.bbox
1✔
842
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None
1✔
843

844
        # Calculate pairing scores if there's a parts_list above the step
845
        proximity_score = 0.0
1✔
846
        alignment_score = 0.0
1✔
847

848
        if (
1✔
849
            parts_list_bbox is not None
850
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
851
        ):
852
            # Calculate distance (how far apart vertically)
853
            distance = step_bbox.y0 - parts_list_bbox.y1
1✔
854

855
            # Calculate proximity score
856
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
1✔
857
            if max_distance > 0:
1✔
858
                proximity_score = max(0.0, 1.0 - (distance / max_distance))
1✔
859

860
            # Calculate alignment score (how well left edges align)
861
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
1✔
862
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
1✔
863
            if max_alignment_diff > 0:
1✔
864
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))
1✔
865

866
        # Create score object with candidate references
867
        # Diagrams are found at build time, not during scoring
868
        score = _StepScore(
1✔
869
            step_value=step_value,
870
            step_number_candidate=step_candidate,
871
            parts_list_candidate=parts_list_candidate,
872
            has_parts_list=parts_list_candidate is not None,
873
            step_proximity_score=proximity_score,
874
            step_alignment_score=alignment_score,
875
        )
876

877
        # Calculate combined bbox for the candidate (without diagram)
878
        combined_bbox = step_bbox
1✔
879
        if parts_list_bbox:
1✔
880
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)
1✔
881

882
        # Create candidate
883
        return Candidate(
1✔
884
            bbox=combined_bbox,
885
            label="step",
886
            score=score.overall_score(),
887
            score_details=score,
888
            source_blocks=[],
889
        )
890

891
    def _score_step_diagram_pair(
1✔
892
        self,
893
        step_bbox: BBox,
894
        diagram_bbox: BBox,
895
    ) -> float:
896
        """Score how well a diagram matches a step.
897

898
        Diagrams are typically positioned to the right of and/or below the step
899
        number. This method scores based on:
900
        - Horizontal position: prefer diagrams to the right, penalize left
901
        - Vertical position: prefer diagrams below the step header
902
        - Distance: closer is better
903

904
        Args:
905
            step_bbox: The step number bounding box
906
            diagram_bbox: The diagram bounding box to score
907

908
        Returns:
909
            Score between 0.0 and 1.0 (higher is better match)
910
        """
911
        # Reference point: bottom-right of step number
UNCOV
912
        ref_x = step_bbox.x1
×
UNCOV
913
        ref_y = step_bbox.y1
×
914

915
        # TODO Move all these constants into config, or make them adaptive?
916

917
        # Horizontal score
918
        # Diagrams to the right are preferred, but allow some overlap
UNCOV
919
        x_offset = diagram_bbox.x0 - ref_x
×
920

UNCOV
921
        if x_offset >= -50:
×
922
            # Diagram starts to the right or slightly overlapping - good
923
            # Score decreases slightly with distance to the right
UNCOV
924
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
×
UNCOV
925
        elif x_offset >= -200:
×
926
            # Diagram is moderately to the left - acceptable
UNCOV
927
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
×
928
        else:
929
            # Diagram is far to the left - poor match
UNCOV
930
            x_score = max(0.1, 0.3 + x_offset / 400.0)
×
931

932
        # Vertical score
933
        # Diagrams below the step header are preferred
UNCOV
934
        y_offset = diagram_bbox.y0 - ref_y
×
935

UNCOV
936
        if y_offset >= -30:
×
937
            # Diagram starts below or slightly overlapping - good
938
            # Score decreases with vertical distance
UNCOV
939
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
×
UNCOV
940
        elif y_offset >= -100:
×
941
            # Diagram is moderately above - less good but acceptable
UNCOV
942
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
×
943
        else:
944
            # Diagram is far above - poor match
945
            y_score = max(0.05, 0.2 + y_offset / 300.0)
×
946

947
        # Combined score - weight both dimensions equally
948
        score = 0.5 * x_score + 0.5 * y_score
×
949

UNCOV
950
        return score
×
951

952
    def _get_rotation_symbol_for_step(
1✔
953
        self,
954
        step_bbox: BBox,
955
        result: ClassificationResult,
956
    ) -> RotationSymbol | None:
957
        """Find an already-built rotation symbol within this step's area.
958

959
        Rotation symbols are built by PageClassifier before steps are processed.
960
        This method finds the already-built rotation symbol that falls within
961
        the step's bounding box.
962

963
        Args:
964
            step_bbox: The step's bounding box (including step_num, parts_list,
965
                and diagram)
966
            result: Classification result containing rotation symbol candidates
967

968
        Returns:
969
            Single RotationSymbol element for this step, or None if not found
970
        """
971
        # Get rotation_symbol candidates - they should already be built
972
        # by PageClassifier. Use get_built_candidates to only get successfully
973
        # constructed rotation symbols.
UNCOV
974
        rotation_symbol_candidates = result.get_built_candidates("rotation_symbol")
×
975

UNCOV
976
        log.debug(
×
977
            "[step] Looking for rotation symbols in step bbox %s, "
978
            "found %d built candidates",
979
            step_bbox,
980
            len(rotation_symbol_candidates),
981
        )
982

UNCOV
983
        if not rotation_symbol_candidates:
×
UNCOV
984
            return None
×
985

986
        # Find best-scoring rotation symbol overlapping the step's bbox
UNCOV
987
        overlapping_candidates = filter_overlapping(
×
988
            rotation_symbol_candidates, step_bbox
989
        )
UNCOV
990
        best_candidate = find_best_scoring(overlapping_candidates)
×
991

UNCOV
992
        if best_candidate:
×
UNCOV
993
            rotation_symbol = result.get_constructed(best_candidate)
×
UNCOV
994
            if rotation_symbol is not None:
×
UNCOV
995
                assert isinstance(rotation_symbol, RotationSymbol)
×
UNCOV
996
                log.debug(
×
997
                    "[step] Found rotation symbol at %s (score=%.2f)",
998
                    rotation_symbol.bbox,
999
                    best_candidate.score,
1000
                )
UNCOV
1001
                return rotation_symbol
×
1002

UNCOV
1003
        log.debug("[step] No rotation symbol found in step bbox %s", step_bbox)
×
UNCOV
1004
        return None
×
1005

1006
    def _get_arrows_for_step(
1✔
1007
        self,
1008
        step_num: StepNumber,
1009
        diagram: Diagram | None,
1010
        subassemblies: Sequence[SubAssembly],
1011
        result: ClassificationResult,
1012
    ) -> list[Arrow]:
1013
        """Collect already-built arrows that belong to this step.
1014

1015
        Arrows are built in Phase 3 of build_all(), before subassemblies.
1016
        This method finds arrows that are positioned near the step's diagram
1017
        and assigns them to this step.
1018

1019
        Typically these are arrows pointing from subassembly callout boxes
1020
        to the main diagram. The search region includes both the diagram and
1021
        any subassemblies assigned to this step.
1022

1023
        Args:
1024
            step_num: The step number element
1025
            diagram: The diagram element (if any)
1026
            subassemblies: List of subassemblies assigned to this step
1027
            result: Classification result containing arrow candidates
1028

1029
        Returns:
1030
            List of Arrow elements for this step
1031
        """
1032
        # Get already-built arrows (built in Phase 3)
1033
        arrow_candidates = result.get_built_candidates("arrow")
1✔
1034

1035
        log.debug(
1✔
1036
            "[step] Looking for arrows for step %d, found %d built arrows",
1037
            step_num.value,
1038
            len(arrow_candidates),
1039
        )
1040

1041
        if not arrow_candidates:
1✔
1042
            return []
1✔
1043

1044
        # Build search region from diagram and subassemblies
1045
        # Arrows typically connect subassemblies to the main diagram, so we need
1046
        # to search both areas
1047
        search_bboxes: list[BBox] = []
1✔
1048
        if diagram:
1✔
1049
            search_bboxes.append(diagram.bbox)
1✔
1050
        for sa in subassemblies:
1✔
1051
            search_bboxes.append(sa.bbox)
1✔
1052

1053
        # Fallback to step number bbox if no diagram or subassemblies
1054
        if not search_bboxes:
1✔
UNCOV
1055
            search_bboxes.append(step_num.bbox)
×
1056

1057
        # Compute union of all components and expand
1058
        search_bbox = BBox.union_all(search_bboxes)
1✔
1059

1060
        # Expand search region to catch arrows near the components
1061
        # Use a larger margin since arrows can extend further
1062
        search_region = search_bbox.expand(100.0)
1✔
1063

1064
        log.debug(
1✔
1065
            "[step] Arrow search region for step %d: %s",
1066
            step_num.value,
1067
            search_region,
1068
        )
1069

1070
        # Find arrows within or overlapping the search region
1071
        arrows: list[Arrow] = []
1✔
1072
        overlapping_candidates = filter_overlapping(arrow_candidates, search_region)
1✔
1073

1074
        for candidate in overlapping_candidates:
1✔
1075
            arrow = result.get_constructed(candidate)
1✔
1076
            assert arrow is not None
1✔
1077
            assert isinstance(arrow, Arrow)
1✔
1078
            log.debug(
1✔
1079
                "[step]   Arrow at %s overlaps search region (score=%.2f)",
1080
                candidate.bbox,
1081
                candidate.score,
1082
            )
1083
            arrows.append(arrow)
1✔
1084

1085
        log.debug(
1✔
1086
            "[step] Found %d arrows for step %d",
1087
            len(arrows),
1088
            step_num.value,
1089
        )
1090
        return arrows
1✔
1091

1092
    def _get_subassemblies_for_step(
1✔
1093
        self,
1094
        step_num: StepNumber,
1095
        diagram: Diagram | None,
1096
        result: ClassificationResult,
1097
    ) -> list[SubAssembly]:
1098
        """Get already-built subassemblies that belong to this step.
1099

1100
        Subassemblies are built in build_all() before steps. This method finds
1101
        subassemblies that are positioned near this step's diagram and haven't
1102
        been claimed by another step yet.
1103

1104
        Args:
1105
            step_num: The step number element
1106
            diagram: The diagram element (if any)
1107
            result: Classification result containing subassembly candidates
1108

1109
        Returns:
1110
            List of SubAssembly elements for this step
1111
        """
1112
        # Get all built subassemblies
1113
        all_subassemblies: Sequence[SubAssembly] = []
×
1114
        for sa_candidate in result.get_built_candidates("subassembly"):
×
1115
            sa = result.get_constructed(sa_candidate)
×
1116
            assert sa is not None
×
UNCOV
1117
            assert isinstance(sa, SubAssembly)
×
UNCOV
1118
            all_subassemblies.append(sa)
×
1119

1120
        log.debug(
×
1121
            "[step] Looking for subassemblies for step %d, found %d built",
1122
            step_num.value,
1123
            len(all_subassemblies),
1124
        )
1125

UNCOV
1126
        if not all_subassemblies:
×
1127
            return []
×
1128

1129
        # Determine search region based on step_number and diagram
UNCOV
1130
        reference_bbox = diagram.bbox.union(step_num.bbox) if diagram else step_num.bbox
×
1131

UNCOV
1132
        page_bbox = result.page_data.bbox
×
1133

1134
        # Expand search region around the reference area
1135
        # Horizontally: search the full page width
1136
        # Vertically: search a region around the reference bbox
UNCOV
1137
        vertical_margin = reference_bbox.height
×
UNCOV
1138
        search_region = BBox(
×
1139
            x0=page_bbox.x0,
1140
            y0=max(page_bbox.y0, reference_bbox.y0 - vertical_margin),
1141
            x1=page_bbox.x1,
1142
            y1=min(page_bbox.y1, reference_bbox.y1 + vertical_margin),
1143
        )
1144

UNCOV
1145
        log.debug(
×
1146
            "[step] SubAssembly search region for step %d: %s",
1147
            step_num.value,
1148
            search_region,
1149
        )
1150

1151
        # Find subassemblies that overlap the search region
UNCOV
1152
        subassemblies: Sequence[SubAssembly] = []
×
UNCOV
1153
        for subassembly in all_subassemblies:
×
UNCOV
1154
            if subassembly.bbox.overlaps(search_region):
×
UNCOV
1155
                log.debug(
×
1156
                    "[step]   SubAssembly at %s overlaps search region",
1157
                    subassembly.bbox,
1158
                )
UNCOV
1159
                subassemblies.append(subassembly)
×
1160

UNCOV
1161
        log.debug(
×
1162
            "[step] Found %d subassemblies for step %d",
1163
            len(subassemblies),
1164
            step_num.value,
1165
        )
UNCOV
1166
        return subassemblies
×
1167

1168
    def _compute_step_bbox(
1✔
1169
        self,
1170
        step_num: StepNumber,
1171
        parts_list: PartsList | None,
1172
        diagram: Diagram | None,
1173
        subassemblies: Sequence[SubAssembly],
1174
        page_bbox: BBox,
1175
    ) -> BBox:
1176
        """Compute the overall bounding box for the Step.
1177

1178
        This encompasses the step number, parts list (if any), diagram (if any),
1179
        and all subassemblies.
1180
        The result is clipped to the page bounds to handle elements that extend
1181
        slightly off-page (e.g., arrows in diagrams).
1182

1183
        Args:
1184
            step_num: The step number element
1185
            parts_list: The parts list (if any)
1186
            diagram: The diagram element (if any)
1187
            subassemblies: List of subassemblies for this step
1188
            page_bbox: The page bounding box to clip to
1189

1190
        Returns:
1191
            Combined bounding box, clipped to page bounds
1192
        """
1193
        bboxes = [step_num.bbox]
1✔
1194
        if parts_list:
1✔
1195
            bboxes.append(parts_list.bbox)
1✔
1196
        if diagram:
1✔
1197
            bboxes.append(diagram.bbox)
1✔
1198
        for sa in subassemblies:
1✔
1199
            bboxes.append(sa.bbox)
1✔
1200

1201
        return BBox.union_all(bboxes).clip_to(page_bbox)
1✔
1202

1203

1204
def assign_diagrams_to_steps(
1✔
1205
    steps: Sequence[Step],
1206
    diagrams: list[Diagram],
1207
    divider_bboxes: Sequence[BBox] = (),
1208
    max_distance: float = 500.0,
1209
) -> None:
1210
    """Assign diagrams to steps using Hungarian algorithm.
1211

1212
    Uses the shared pairing module to find optimal step number to diagram
1213
    pairings based on:
1214
    - Position: step number should be in top-left of diagram
1215
    - Distance: closer is better
1216
    - Dividers: pairings should not cross divider lines
1217

1218
    This function mutates the Step objects in place, setting their diagram
1219
    attribute.
1220

1221
    Args:
1222
        steps: List of Step objects to assign diagrams to
1223
        diagrams: List of Diagram objects to assign
1224
        divider_bboxes: Sequence of divider bounding boxes to check for crossing
1225
        max_distance: Maximum distance for a valid assignment.
1226
    """
1227
    if not steps or not diagrams:
1✔
1228
        log.debug(
1✔
1229
            "[step] No diagram assignment needed: steps=%d, diagrams=%d",
1230
            len(steps),
1231
            len(diagrams),
1232
        )
1233
        return
1✔
1234

1235
    log.debug(
1✔
1236
        "[step] Running Hungarian matching for diagrams: %d steps, %d diagrams",
1237
        len(steps),
1238
        len(diagrams),
1239
    )
1240

1241
    # Configure pairing
1242
    config = PairingConfig(
1✔
1243
        max_distance=max_distance,
1244
        position_weight=0.5,
1245
        distance_weight=0.5,
1246
        check_dividers=True,
1247
        top_left_tolerance=100.0,
1248
    )
1249

1250
    # Extract bboxes
1251
    step_bboxes = [s.step_number.bbox for s in steps]
1✔
1252
    diagram_bboxes = [d.bbox for d in diagrams]
1✔
1253

1254
    # Find optimal pairings using shared logic
1255
    pairings = find_optimal_pairings(
1✔
1256
        step_bboxes, diagram_bboxes, config, divider_bboxes
1257
    )
1258

1259
    # Assign diagrams to steps based on the matching
1260
    for pairing in pairings:
1✔
1261
        step = steps[pairing.step_index]
1✔
1262
        diag = diagrams[pairing.diagram_index]
1✔
1263
        # Use object.__setattr__ because Step is a frozen Pydantic model
1264
        object.__setattr__(step, "diagram", diag)
1✔
1265
        log.debug(
1✔
1266
            "[step] Assigned diagram at %s to step %d (cost=%.2f)",
1267
            diag.bbox,
1268
            step.step_number.value,
1269
            pairing.cost,
1270
        )
1271

1272

1273
def assign_subassemblies_to_steps(
1✔
1274
    steps: Sequence[Step],
1275
    subassemblies: Sequence[SubAssembly],
1276
    divider_bboxes: Sequence[BBox] = (),
1277
    max_distance: float = 400.0,
1278
) -> None:
1279
    """Assign subassemblies to steps using Hungarian algorithm.
1280

1281
    Uses optimal bipartite matching to pair subassemblies with steps based on:
1282
    - Distance from step's diagram to subassembly (closer is better)
1283
    - No divider between the subassembly and the step's diagram
1284

1285
    This function mutates the Step objects in place, adding to their
1286
    subassemblies list.
1287

1288
    Args:
1289
        steps: List of Step objects to assign subassemblies to
1290
        subassemblies: List of SubAssembly objects to assign
1291
        divider_bboxes: Sequence of divider bounding boxes for obstruction checking
1292
        max_distance: Maximum distance for a valid assignment
1293
    """
1294
    if not steps or not subassemblies:
1✔
1295
        log.debug(
1✔
1296
            "[step] No subassembly assignment needed: steps=%d, subassemblies=%d",
1297
            len(steps),
1298
            len(subassemblies),
1299
        )
1300
        return
1✔
1301

1302
    n_steps = len(steps)
1✔
1303
    n_subassemblies = len(subassemblies)
1✔
1304

1305
    log.debug(
1✔
1306
        "[step] Running Hungarian matching for subassemblies: "
1307
        "%d steps, %d subassemblies",
1308
        n_steps,
1309
        n_subassemblies,
1310
    )
1311

1312
    # Build cost matrix: rows = subassemblies, cols = steps
1313
    cost_matrix = np.zeros((n_subassemblies, n_steps))
1✔
1314
    high_cost = max_distance * 10
1✔
1315

1316
    for i, sa in enumerate(subassemblies):
1✔
1317
        sa_center = sa.bbox.center
1✔
1318
        for j, step in enumerate(steps):
1✔
1319
            # Use diagram center if available, otherwise step bbox center
1320
            if step.diagram:
1✔
1321
                target_bbox = step.diagram.bbox
1✔
1322
                target_center = target_bbox.center
1✔
1323
            else:
UNCOV
1324
                target_bbox = step.bbox
×
UNCOV
1325
                target_center = step.step_number.bbox.center
×
1326

1327
            # Base cost is distance from diagram/step to subassembly center
1328
            distance = (
1✔
1329
                (target_center[0] - sa_center[0]) ** 2
1330
                + (target_center[1] - sa_center[1]) ** 2
1331
            ) ** 0.5
1332

1333
            # Check for divider between subassembly and step's diagram
1334
            if has_divider_between(sa.bbox, target_bbox, divider_bboxes):
1✔
1335
                # High cost if there's a divider between
1336
                distance = high_cost
1✔
1337
                log.debug(
1✔
1338
                    "[step]   Divider between subassembly[%d] at %s and step[%d]",
1339
                    i,
1340
                    sa.bbox,
1341
                    step.step_number.value,
1342
                )
1343

1344
            cost_matrix[i, j] = distance
1✔
1345
            log.debug(
1✔
1346
                "[step]   Cost subassembly[%d] at %s to step[%d]: %.1f",
1347
                i,
1348
                sa.bbox,
1349
                step.step_number.value,
1350
                distance,
1351
            )
1352

1353
    # Apply max_distance threshold
1354
    cost_matrix_thresholded = np.where(
1✔
1355
        cost_matrix > max_distance, high_cost, cost_matrix
1356
    )
1357

1358
    # Run Hungarian algorithm
1359
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1360

1361
    # Assign subassemblies to steps based on the matching
1362
    # Note: Unlike diagrams, multiple subassemblies can be assigned to one step
1363
    # The Hungarian algorithm gives us the best 1:1 matching, but we may want
1364
    # to assign additional nearby subassemblies too
1365

1366
    # First, do the 1:1 optimal assignment
1367
    assigned_subassemblies: set[int] = set()
1✔
1368
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1369
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1370
            sa = subassemblies[row_idx]
1✔
1371
            step = steps[col_idx]
1✔
1372
            # Add to step's subassemblies list
1373
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1374
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1375
            assigned_subassemblies.add(row_idx)
1✔
1376
            log.debug(
1✔
1377
                "[step] Assigned subassembly at %s to step %d (cost=%.1f)",
1378
                sa.bbox,
1379
                step.step_number.value,
1380
                cost_matrix[row_idx, col_idx],
1381
            )
1382

1383
    # Then, try to assign remaining unconsumed subassemblies to their nearest step
1384
    # (if within max_distance and no divider between)
1385
    for i, sa in enumerate(subassemblies):
1✔
1386
        if i in assigned_subassemblies:
1✔
1387
            continue
1✔
1388

1389
        # Find the step with lowest cost for this subassembly
1390
        best_step_idx = None
1✔
1391
        best_cost = high_cost
1✔
1392
        for j in range(len(steps)):
1✔
1393
            if cost_matrix[i, j] < best_cost:
1✔
1394
                best_cost = cost_matrix[i, j]
1✔
1395
                best_step_idx = j
1✔
1396

1397
        if best_step_idx is not None and best_cost <= max_distance:
1✔
1398
            step = steps[best_step_idx]
1✔
1399
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1400
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1401
            log.debug(
1✔
1402
                "[step] Assigned extra subassembly at %s to step %d (cost=%.1f)",
1403
                sa.bbox,
1404
                step.step_number.value,
1405
                best_cost,
1406
            )
1407

1408

1409
def assign_rotation_symbols_to_steps(
1✔
1410
    steps: Sequence[Step],
1411
    rotation_symbols: list[RotationSymbol],
1412
    max_distance: float = 300.0,
1413
) -> None:
1414
    """Assign rotation symbols to steps using Hungarian algorithm.
1415

1416
    Uses optimal bipartite matching to pair rotation symbols with steps based on
1417
    minimum distance from the rotation symbol to the step's diagram (or step bbox
1418
    center if no diagram).
1419

1420
    This function mutates the Step objects in place, setting their rotation_symbol
1421
    attribute.
1422

1423
    Args:
1424
        steps: List of Step objects to assign rotation symbols to
1425
        rotation_symbols: List of RotationSymbol objects to assign
1426
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1427
            greater than this will not be matched.
1428
    """
1429
    if not steps or not rotation_symbols:
1✔
1430
        log.debug(
1✔
1431
            "[step] No rotation symbol assignment needed: "
1432
            "steps=%d, rotation_symbols=%d",
1433
            len(steps),
1434
            len(rotation_symbols),
1435
        )
1436
        return
1✔
1437

1438
    n_steps = len(steps)
1✔
1439
    n_symbols = len(rotation_symbols)
1✔
1440

1441
    log.debug(
1✔
1442
        "[step] Running Hungarian matching: %d steps, %d rotation symbols",
1443
        n_steps,
1444
        n_symbols,
1445
    )
1446

1447
    # Build cost matrix: rows = rotation symbols, cols = steps
1448
    # Cost = distance from rotation symbol center to step's diagram center
1449
    # (or step bbox center if no diagram)
1450
    cost_matrix = np.zeros((n_symbols, n_steps))
1✔
1451

1452
    for i, rs in enumerate(rotation_symbols):
1✔
1453
        rs_center = rs.bbox.center
1✔
1454
        for j, step in enumerate(steps):
1✔
1455
            # Use diagram center if available, otherwise step bbox center
1456
            if step.diagram:
1✔
1457
                target_center = step.diagram.bbox.center
1✔
1458
            else:
UNCOV
1459
                target_center = step.bbox.center
×
1460

1461
            # Euclidean distance
1462
            distance = (
1✔
1463
                (rs_center[0] - target_center[0]) ** 2
1464
                + (rs_center[1] - target_center[1]) ** 2
1465
            ) ** 0.5
1466

1467
            cost_matrix[i, j] = distance
1✔
1468
            log.debug(
1✔
1469
                "[step]   Distance from rotation_symbol[%d] at %s to step[%d]: %.1f",
1470
                i,
1471
                rs.bbox,
1472
                step.step_number.value,
1473
                distance,
1474
            )
1475

1476
    # Apply max_distance threshold by setting costs above threshold to a high value
1477
    # This prevents assignments that are too far apart
1478
    high_cost = max_distance * 10  # Arbitrary large value
1✔
1479
    cost_matrix_thresholded = np.where(
1✔
1480
        cost_matrix > max_distance, high_cost, cost_matrix
1481
    )
1482

1483
    # Run Hungarian algorithm
1484
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1485

1486
    # Assign rotation symbols to steps based on the matching
1487
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1488
        # Check if this assignment is within the max_distance threshold
1489
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1490
            rs = rotation_symbols[row_idx]
1✔
1491
            step = steps[col_idx]
1✔
1492
            step.rotation_symbol = rs
1✔
1493
            # Expand step bbox to include the rotation symbol
1494
            step.bbox = step.bbox.union(rs.bbox)
1✔
1495
            log.debug(
1✔
1496
                "[step] Assigned rotation symbol at %s to step %d "
1497
                "(distance=%.1f, new step bbox=%s)",
1498
                rs.bbox,
1499
                step.step_number.value,
1500
                cost_matrix[row_idx, col_idx],
1501
                step.bbox,
1502
            )
1503
        else:
UNCOV
1504
            log.debug(
×
1505
                "[step] Skipped assignment: rotation_symbol[%d] to step[%d] "
1506
                "distance=%.1f > max_distance=%.1f",
1507
                row_idx,
1508
                col_indices[row_idx] if row_idx < len(col_indices) else -1,
1509
                cost_matrix[row_idx, col_idx],
1510
                max_distance,
1511
            )
1512

1513

1514
def filter_subassembly_values[T](
1✔
1515
    items: list[tuple[int, T]],
1516
    *,
1517
    min_gap: int = 3,
1518
    max_subassembly_start: int = 3,
1519
) -> list[tuple[int, T]]:
1520
    """Filter items to exclude likely subassembly values.
1521

1522
    Subassembly steps (e.g., step 1, 2 inside a subassembly box) often appear
1523
    alongside higher-numbered page-level steps (e.g., 15, 16). This creates
1524
    sequences like [1, 2, 15, 16] which cannot all be page-level steps.
1525

1526
    This function identifies such cases by detecting a significant gap in values
1527
    and filtering out the lower-numbered items that are likely subassembly steps.
1528

1529
    Args:
1530
        items: List of (value, item) tuples where value is the step number
1531
            and item is any associated data (e.g., a Candidate).
1532
        min_gap: Minimum gap size to trigger filtering (exclusive).
1533
            Default is 3, so gaps > 3 trigger filtering.
1534
        max_subassembly_start: Maximum starting value for subassembly detection.
1535
            If min value is <= this, filtering is applied. Default is 3.
1536

1537
    Returns:
1538
        Filtered list of (value, item) tuples, keeping only the higher values
1539
        if a subassembly pattern is detected. Returns original list if no
1540
        filtering is needed.
1541

1542
    Examples:
1543
        >>> filter_subassembly_values([(1, "a"), (2, "b"), (15, "c"), (16, "d")])
1544
        [(15, "c"), (16, "d")]
1545

1546
        >>> filter_subassembly_values([(5, "a"), (6, "b"), (15, "c")])
1547
        [(5, "a"), (6, "b"), (15, "c")]  # min=5 > 3, no filtering
1548
    """
1549
    if len(items) <= 1:
1✔
1550
        return items
1✔
1551

1552
    # Sort by value
1553
    sorted_items = sorted(items, key=lambda x: x[0])
1✔
1554
    values = [v for v, _ in sorted_items]
1✔
1555

1556
    # Find the largest gap between consecutive values
1557
    max_gap_size = 0
1✔
1558
    max_gap_index = -1
1✔
1559
    for i in range(len(values) - 1):
1✔
1560
        gap = values[i + 1] - values[i]
1✔
1561
        if gap > max_gap_size:
1✔
1562
            max_gap_size = gap
1✔
1563
            max_gap_index = i
1✔
1564

1565
    # Check if filtering should occur:
1566
    # 1. Gap must be larger than min_gap
1567
    # 2. The minimum value must be <= max_subassembly_start
1568
    if max_gap_size > min_gap and max_gap_index >= 0:
1✔
1569
        min_value = values[0]
1✔
1570
        if min_value <= max_subassembly_start:
1✔
1571
            threshold = values[max_gap_index + 1]
1✔
1572
            return [(v, item) for v, item in sorted_items if v >= threshold]
1✔
1573

1574
    return items
1✔
1575

1576

1577
def filter_valid_substep_sequence[T](
1✔
1578
    items: list[T],
1579
    get_value: Callable[[T], int],
1580
) -> list[T]:
1581
    """Filter substeps to only include valid sequential values starting from 1.
1582

1583
    Substeps within a subassembly or page must form a valid sequence:
1584
    - Must start from 1
1585
    - Values must be consecutive (1, 2, 3, ... not 1, 3, 5)
1586

1587
    This filters out isolated numbers that are likely PieceLengths or other
1588
    misidentified elements.
1589

1590
    Args:
1591
        items: List of substep items to filter
1592
        get_value: Function to extract the step number value from an item
1593

1594
    Returns:
1595
        Filtered list containing only items that form a valid sequence from 1.
1596
        Returns empty list if no valid sequence starting from 1 is found.
1597

1598
    Examples:
1599
        >>> filter_valid_substep_sequence([1, 2, 3], lambda x: x)
1600
        [1, 2, 3]
1601

1602
        >>> filter_valid_substep_sequence([3, 5], lambda x: x)
1603
        []  # No item with value 1
1604

1605
        >>> filter_valid_substep_sequence([1, 2, 5], lambda x: x)
1606
        [1, 2]  # 5 breaks the sequence
1607
    """
1608
    if not items:
1✔
UNCOV
1609
        return []
×
1610

1611
    # Build a map from value to items
1612
    value_to_items: dict[int, list[T]] = {}
1✔
1613
    for item in items:
1✔
1614
        value = get_value(item)
1✔
1615
        if value not in value_to_items:
1✔
1616
            value_to_items[value] = []
1✔
1617
        value_to_items[value].append(item)
1✔
1618

1619
    # Must have step 1 to be a valid sequence
1620
    if 1 not in value_to_items:
1✔
1621
        log.debug("[substep_filter] No substep with value 1 found - rejecting all")
1✔
1622
        return []
1✔
1623

1624
    # Collect items in sequence starting from 1
1625
    result: list[T] = []
1✔
1626
    expected_value = 1
1✔
1627

1628
    while expected_value in value_to_items:
1✔
1629
        result.extend(value_to_items[expected_value])
1✔
1630
        expected_value += 1
1✔
1631

1632
    log.debug(
1✔
1633
        "[substep_filter] Filtered %d items to %d valid sequential substeps (1-%d)",
1634
        len(items),
1635
        len(result),
1636
        expected_value - 1,
1637
    )
1638

1639
    return result
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc