• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20204946005

14 Dec 2025 07:59AM UTC coverage: 87.002% (-3.8%) from 90.787%
20204946005

push

github

bramp
Optimize fixture test serialization with transform_for_json

- Combine round_floats and reorder_tag_first into single transform_for_json
  function, reducing serialization from 3 passes to 2
- Simplify fixture test to read JSON directly from disk instead of
  parsing through Pydantic then re-serializing
- Remove redundant round_floats and reorder_tag_first functions
- Update utils_test.py with tests for transform_for_json

Test time reduced from ~51s to ~35s (~30% improvement)

47 of 55 new or added lines in 4 files covered. (85.45%)

560 existing lines in 26 files now uncovered.

12349 of 14194 relevant lines covered (87.0%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

64.97
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24

25
import numpy as np
1✔
26
from scipy.optimize import linear_sum_assignment
1✔
27

28
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
29
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
30
    CandidateFailedError,
31
    ClassificationResult,
32
)
33
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
34
    LabelClassifier,
35
)
36
from build_a_long.pdf_extract.classifier.score import (
1✔
37
    Score,
38
    Weight,
39
    find_best_scoring,
40
)
41
from build_a_long.pdf_extract.classifier.text import (
1✔
42
    extract_step_number_value,
43
)
44
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
45
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
46
    Arrow,
47
    Diagram,
48
    Divider,
49
    LegoPageElements,
50
    PartsList,
51
    RotationSymbol,
52
    Step,
53
    StepNumber,
54
    SubAssembly,
55
)
56
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
57

58
log = logging.getLogger(__name__)
1✔
59

60

61
class _StepScore(Score):
1✔
62
    """Internal score representation for step classification."""
63

64
    step_value: int
65
    """The parsed step number value (e.g., 1, 2, 3)."""
1✔
66

67
    step_number_candidate: Candidate
68
    """The step number candidate this step is associated with."""
1✔
69

70
    parts_list_candidate: Candidate | None
71
    """The parts list candidate paired with this step (if any)."""
1✔
72

73
    has_parts_list: bool
74
    """Whether this step has an associated parts list."""
1✔
75

76
    step_proximity_score: float
77
    """Score based on proximity to the PartsList above (0.0-1.0).
1✔
78
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""
79

80
    step_alignment_score: float
81
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
1✔
82
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""
83

84
    def score(self) -> Weight:
1✔
85
        """Return the overall pairing score."""
86
        return self.overall_score()
×
87

88
    def overall_score(self) -> float:
1✔
89
        """Calculate overall quality score based on parts list pairing.
90

91
        Steps with a parts_list are given a bonus to prefer them over
92
        steps without parts_list. Diagrams are found at build time, not
93
        during scoring, to allow rotation symbols to claim small images first.
94
        """
95
        if self.has_parts_list:
1✔
96
            # Base score for having parts_list + proximity/alignment bonus
97
            parts_list_bonus = 0.5
1✔
98
            pairing_score = (
1✔
99
                self.step_proximity_score + self.step_alignment_score
100
            ) / 2.0
101
            return parts_list_bonus + 0.5 * pairing_score
1✔
102
        return 0.3  # Lower base score for steps without parts list
1✔
103

104
    def sort_key(self) -> tuple[float, int]:
1✔
105
        """Return a tuple for sorting candidates.
106

107
        We prefer:
108
        1. Higher overall scores (better StepNumber-PartsList-Diagram match)
109
        2. Lower step number values (to break ties and maintain order)
110
        """
111
        return (-self.overall_score(), self.step_value)
1✔
112

113

114
class StepClassifier(LabelClassifier):
1✔
115
    """Classifier for complete Step structures."""
116

117
    output = "step"
1✔
118
    requires = frozenset(
1✔
119
        {
120
            "arrow",
121
            "step_number",
122
            "parts_list",
123
            "diagram",
124
            "rotation_symbol",
125
            "subassembly",
126
            "preview",
127
        }
128
    )
129

130
    def _score(self, result: ClassificationResult) -> None:
1✔
131
        """Score step pairings and create candidates."""
132
        page_data = result.page_data
1✔
133

134
        # Get step number and parts list candidates (not constructed elements)
135
        step_number_candidates = result.get_scored_candidates(
1✔
136
            "step_number", valid_only=False, exclude_failed=True
137
        )
138

139
        if not step_number_candidates:
1✔
140
            return
1✔
141

142
        # Get parts_list candidates
143
        parts_list_candidates = result.get_scored_candidates(
1✔
144
            "parts_list",
145
            valid_only=False,
146
            exclude_failed=True,
147
        )
148

149
        log.debug(
1✔
150
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
151
            page_data.page_number,
152
            len(step_number_candidates),
153
            len(parts_list_candidates),
154
        )
155

156
        # Create all possible Step candidates for pairings (without diagrams initially)
157
        # Deduplication happens at build time, not scoring time, so that
158
        # diagram assignment can be re-evaluated as blocks get claimed.
159
        all_candidates: list[Candidate] = []
1✔
160
        for step_candidate in step_number_candidates:
1✔
161
            # Create candidates for this StepNumber paired with each PartsList
162
            for parts_list_candidate in parts_list_candidates:
1✔
163
                candidate = self._create_step_candidate(
1✔
164
                    step_candidate, parts_list_candidate, result
165
                )
166
                if candidate:
1✔
167
                    all_candidates.append(candidate)
1✔
168

169
            # Also create a candidate with no PartsList (fallback)
170
            candidate = self._create_step_candidate(step_candidate, None, result)
1✔
171
            if candidate:
1✔
172
                all_candidates.append(candidate)
1✔
173

174
        # Add all candidates to result (deduplication happens at build time)
175
        for candidate in all_candidates:
1✔
176
            result.add_candidate(candidate)
1✔
177

178
        log.debug(
1✔
179
            "[step] Created %d step candidates",
180
            len(all_candidates),
181
        )
182

183
    def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
1✔
184
        """Build all Step elements with coordinated element assignment.
185

186
        Build Order (phases run sequentially):
187
        - Phase 1: Rotation symbols - claim small circular arrow graphics
188
        - Phase 2: Parts lists - claim part count blocks and part images
189
        - Phase 3: Arrows - claim arrowhead triangles and shaft lines/rectangles
190
        - Phase 4: Subassemblies & Previews - claim white callout boxes
191
        - Phase 5: Steps - build step_number + parts_list pairs (no diagram yet)
192
        - Phase 6: Diagrams - build remaining unclaimed diagram regions
193
        - Phase 7: Assign diagrams to steps via Hungarian matching
194
        - Phase 8: Assign subassemblies to steps via Hungarian matching
195
        - Phase 9: Finalize steps - add arrows, compute final bboxes
196
        - Phase 10: Assign rotation symbols to steps via Hungarian matching
197

198
        The order matters because elements compete for the same Drawing blocks.
199
        Earlier phases claim blocks first, preventing later phases from using them.
200
        For example:
201
        - Arrows must build before subassemblies so arrow shafts aren't consumed
202
          by subassembly white boxes
203
        - Rotation symbols must build before diagrams so small circular graphics
204
          aren't incorrectly clustered into diagrams
205

206
        Deduplication of steps happens at build time (not scoring time) so that
207
        diagram assignment can be re-evaluated as blocks get claimed.
208

209
        Returns:
210
            List of successfully constructed Step elements, sorted by step number
211
        """
212
        # Phase 1: Build all rotation symbols BEFORE steps.
213
        # This allows rotation symbols to claim their Drawing blocks first,
214
        # preventing them from being incorrectly clustered into diagrams.
215
        for rs_candidate in result.get_scored_candidates(
1✔
216
            "rotation_symbol", valid_only=False, exclude_failed=True
217
        ):
UNCOV
218
            try:
×
UNCOV
219
                result.build(rs_candidate)
×
UNCOV
220
                log.debug(
×
221
                    "[step] Built rotation symbol at %s (score=%.2f)",
222
                    rs_candidate.bbox,
223
                    rs_candidate.score,
224
                )
225
            except Exception as e:
×
226
                log.debug(
×
227
                    "[step] Failed to construct rotation_symbol candidate at %s: %s",
228
                    rs_candidate.bbox,
229
                    e,
230
                )
231

232
        # Phase 2: Build all parts lists BEFORE steps.
233
        # This allows parts lists to claim their Drawing blocks first,
234
        # preventing them from being claimed by subassemblies.
235
        for pl_candidate in result.get_scored_candidates(
1✔
236
            "parts_list", valid_only=False, exclude_failed=True
237
        ):
238
            try:
1✔
239
                result.build(pl_candidate)
1✔
240
                log.debug(
1✔
241
                    "[step] Built parts_list at %s (score=%.2f)",
242
                    pl_candidate.bbox,
243
                    pl_candidate.score,
244
                )
245
            except Exception as e:
×
246
                log.debug(
×
247
                    "[step] Failed to construct parts_list candidate at %s: %s",
248
                    pl_candidate.bbox,
249
                    e,
250
                )
251

252
        # Phase 3: Build all arrows BEFORE subassemblies.
253
        # Arrows connect subassembly callout boxes to main diagrams. The arrow
254
        # shaft may be a thin rectangle that overlaps the subassembly's white
255
        # box. By building arrows first, they claim the shaft blocks before
256
        # subassemblies can consume them.
257
        for arrow_candidate in result.get_scored_candidates(
1✔
258
            "arrow", valid_only=False, exclude_failed=True
259
        ):
260
            try:
1✔
261
                result.build(arrow_candidate)
1✔
262
                log.debug(
1✔
263
                    "[step] Built arrow at %s (score=%.2f)",
264
                    arrow_candidate.bbox,
265
                    arrow_candidate.score,
266
                )
267
            except Exception as e:
×
268
                log.debug(
×
269
                    "[step] Failed to construct arrow candidate at %s: %s",
270
                    arrow_candidate.bbox,
271
                    e,
272
                )
273

274
        # Phase 4: Build subassemblies and previews BEFORE steps.
275
        # Both subassemblies and previews are white boxes with diagrams inside.
276
        # We combine them and build in score order so the higher-scoring
277
        # candidate claims the white box first. When a candidate is built,
278
        # its source_blocks are marked as consumed, causing any competing
279
        # candidate using the same blocks to fail.
280
        #
281
        # This allows subassemblies (which have step_count labels like "2x")
282
        # to be distinguished from previews (which appear before steps).
283
        subassembly_candidates = result.get_scored_candidates(
1✔
284
            "subassembly", valid_only=False, exclude_failed=True
285
        )
286
        preview_candidates = result.get_scored_candidates(
1✔
287
            "preview", valid_only=False, exclude_failed=True
288
        )
289

290
        # Combine and sort by score (highest first)
291
        combined_candidates = list(subassembly_candidates) + list(preview_candidates)
1✔
292
        combined_candidates.sort(key=lambda c: c.score, reverse=True)
1✔
293

294
        for candidate in combined_candidates:
1✔
295
            try:
1✔
296
                result.build(candidate)
1✔
297
                log.debug(
1✔
298
                    "[step] Built %s at %s (score=%.2f)",
299
                    candidate.label,
300
                    candidate.bbox,
301
                    candidate.score,
302
                )
UNCOV
303
            except Exception as e:
×
UNCOV
304
                log.debug(
×
305
                    "[step] Failed to construct %s candidate at %s: %s",
306
                    candidate.label,
307
                    candidate.bbox,
308
                    e,
309
                )
310

311
        # Phase 5: Build Step candidates with deduplication by step value
312
        # Filter out subassembly steps, then build in score order, skipping
313
        # step values that have already been built.
314
        # Steps are built as "partial" - just step_number + parts_list.
315
        # Diagrams and subassemblies are assigned later via Hungarian matching.
316
        all_step_candidates = result.get_scored_candidates(
1✔
317
            "step", valid_only=False, exclude_failed=True
318
        )
319
        page_level_step_candidates = self._filter_page_level_step_candidates(
1✔
320
            all_step_candidates
321
        )
322

323
        # Sort by score (highest first) so best candidates get built first
324
        sorted_candidates = sorted(
1✔
325
            page_level_step_candidates,
326
            key=lambda c: c.score,
327
            reverse=True,
328
        )
329

330
        steps: list[Step] = []
1✔
331
        built_step_values: set[int] = set()
1✔
332

333
        for step_candidate in sorted_candidates:
1✔
334
            # Get step value from score
335
            score = step_candidate.score_details
1✔
336
            if not isinstance(score, _StepScore):
1✔
337
                continue
×
338

339
            # Skip if we've already built a step with this value
340
            if score.step_value in built_step_values:
1✔
341
                log.debug(
1✔
342
                    "[step] Skipping duplicate step %d candidate (score=%.2f)",
343
                    score.step_value,
344
                    step_candidate.score,
345
                )
346
                continue
1✔
347

348
            try:
1✔
349
                elem = result.build(step_candidate)
1✔
350
                assert isinstance(elem, Step)
1✔
351
                steps.append(elem)
1✔
352
                built_step_values.add(score.step_value)
1✔
353
                log.debug(
1✔
354
                    "[step] Built partial step %d (parts_list=%s, score=%.2f)",
355
                    score.step_value,
356
                    "yes" if score.parts_list_candidate is not None else "no",
357
                    step_candidate.score,
358
                )
359
            except Exception as e:
1✔
360
                log.debug(
1✔
361
                    "[step] Failed to construct step %d candidate: %s",
362
                    score.step_value,
363
                    e,
364
                )
365

366
        # Sort steps by their step_number value
367
        steps.sort(key=lambda step: step.step_number.value)
1✔
368

369
        # Phase 6: Build diagrams (not yet claimed by subassemblies)
370
        # Phase 7: Assign diagrams to steps using Hungarian matching
371
        # Collect available diagram candidates (not yet claimed by subassemblies)
372
        available_diagrams: list[Diagram] = []
1✔
373
        for diag_candidate in result.get_scored_candidates(
1✔
374
            "diagram", valid_only=False, exclude_failed=True
375
        ):
376
            if diag_candidate.constructed is None:
1✔
377
                # Build the diagram
378
                try:
1✔
379
                    diag_elem = result.build(diag_candidate)
1✔
380
                    assert isinstance(diag_elem, Diagram)
1✔
381
                    available_diagrams.append(diag_elem)
1✔
382
                except Exception as e:
1✔
383
                    log.debug(
1✔
384
                        "[step] Failed to build diagram at %s: %s",
385
                        diag_candidate.bbox,
386
                        e,
387
                    )
388
            elif isinstance(diag_candidate.constructed, Diagram):
1✔
389
                # Already built (claimed by subassembly) - skip
390
                pass
1✔
391

392
        log.debug(
1✔
393
            "[step] Available diagrams for step assignment: %d",
394
            len(available_diagrams),
395
        )
396

397
        # Assign diagrams to steps using Hungarian matching
398
        assign_diagrams_to_steps(steps, available_diagrams)
1✔
399

400
        # Phase 8: Assign subassemblies to steps using Hungarian matching
401
        # Collect built subassemblies
402
        subassemblies: list[SubAssembly] = []
1✔
403
        for sa_candidate in result.get_scored_candidates(
1✔
404
            "subassembly", valid_only=True
405
        ):
406
            assert sa_candidate.constructed is not None
1✔
407
            assert isinstance(sa_candidate.constructed, SubAssembly)
1✔
408
            subassemblies.append(sa_candidate.constructed)
1✔
409

410
        # Collect built dividers for obstruction checking
411
        dividers: list[Divider] = []
1✔
412
        for div_candidate in result.get_scored_candidates("divider", valid_only=True):
1✔
UNCOV
413
            assert div_candidate.constructed is not None
×
UNCOV
414
            assert isinstance(div_candidate.constructed, Divider)
×
UNCOV
415
            dividers.append(div_candidate.constructed)
×
416

417
        log.debug(
1✔
418
            "[step] Assigning %d subassemblies to %d steps (%d dividers for "
419
            "obstruction checking)",
420
            len(subassemblies),
421
            len(steps),
422
            len(dividers),
423
        )
424

425
        # Assign subassemblies to steps using Hungarian matching
426
        assign_subassemblies_to_steps(steps, subassemblies, dividers)
1✔
427

428
        # Phase 9: Finalize steps - collect arrows and compute final bboxes
429
        page_bbox = result.page_data.bbox
1✔
430
        for step in steps:
1✔
431
            # Get arrows for this step
432
            arrows = self._get_arrows_for_step(step.step_number, step.diagram, result)
1✔
433

434
            # Compute final bbox including all components
435
            final_bbox = self._compute_step_bbox(
1✔
436
                step.step_number,
437
                step.parts_list,
438
                step.diagram,
439
                step.subassemblies,
440
                page_bbox,
441
            )
442

443
            # Update the step (Step is a Pydantic model, so we need to reassign)
444
            # We mutate in place by directly setting attributes
445
            object.__setattr__(step, "arrows", arrows)
1✔
446
            object.__setattr__(step, "bbox", final_bbox)
1✔
447

448
        # Phase 10: Assign rotation symbols to steps using Hungarian matching
449
        # Collect built rotation symbols
450
        rotation_symbols: list[RotationSymbol] = []
1✔
451
        for rs_candidate in result.get_scored_candidates(
1✔
452
            "rotation_symbol", valid_only=True
453
        ):
UNCOV
454
            assert rs_candidate.constructed is not None
×
UNCOV
455
            assert isinstance(rs_candidate.constructed, RotationSymbol)
×
UNCOV
456
            rotation_symbols.append(rs_candidate.constructed)
×
457

458
        assign_rotation_symbols_to_steps(steps, rotation_symbols)
1✔
459

460
        log.debug(
1✔
461
            "[step] build_all complete: %d steps, %d rotation symbols assigned",
462
            len(steps),
463
            sum(1 for s in steps if s.rotation_symbol is not None),
464
        )
465

466
        # Cast for type compatibility with base class return type
467
        return list[LegoPageElements](steps)
1✔
468

469
    def _filter_page_level_step_candidates(
1✔
470
        self, candidates: list[Candidate]
471
    ) -> list[Candidate]:
472
        """Filter step candidates to exclude likely subassembly steps.
473

474
        Extracts step number values from candidates and uses the generic
475
        filter_subassembly_values function to filter out subassembly steps.
476

477
        Args:
478
            candidates: All step candidates to filter
479

480
        Returns:
481
            Filtered list of candidates likely to be page-level steps
482
        """
483
        # Extract step number values from candidates
484
        items_with_values: list[tuple[int, Candidate]] = []
1✔
485
        for candidate in candidates:
1✔
486
            score = candidate.score_details
1✔
487
            if not isinstance(score, _StepScore):
1✔
488
                continue
×
489
            items_with_values.append((score.step_value, candidate))
1✔
490

491
        # Use generic filtering function
492
        filtered_items = filter_subassembly_values(items_with_values)
1✔
493

494
        # If filtering occurred, return only the filtered candidates
495
        if len(filtered_items) != len(items_with_values):
1✔
496
            filtered_values = [v for v, _ in filtered_items]
1✔
497
            log.debug(
1✔
498
                "[step] Filtered out likely subassembly steps, keeping values: %s",
499
                sorted(filtered_values),
500
            )
501
            return [item for _, item in filtered_items]
1✔
502

503
        # No filtering occurred, return original candidates
504
        # (includes any candidates that couldn't have their value extracted)
505
        return candidates
1✔
506

507
    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
1✔
508
        """Construct a partial Step element from a single candidate.
509

510
        This creates a Step with just step_number and parts_list. The diagram,
511
        subassemblies, and arrows are assigned later in build_all() using
512
        Hungarian matching for optimal global assignment.
513
        """
514
        score = candidate.score_details
1✔
515
        assert isinstance(score, _StepScore)
1✔
516

517
        # Validate and extract step number from parent candidate
518
        step_num_candidate = score.step_number_candidate
1✔
519

520
        step_num_elem = result.build(step_num_candidate)
1✔
521
        assert isinstance(step_num_elem, StepNumber)
1✔
522
        step_num = step_num_elem
1✔
523

524
        # Validate and extract parts list from parent candidate (if present)
525
        parts_list = None
1✔
526
        if score.parts_list_candidate:
1✔
527
            parts_list_candidate = score.parts_list_candidate
1✔
528
            parts_list_elem = result.build(parts_list_candidate)
1✔
529
            assert isinstance(parts_list_elem, PartsList)
1✔
530
            parts_list = parts_list_elem
1✔
531

532
        # Create partial step - diagram, subassemblies, arrows assigned later
533
        initial_bbox = step_num.bbox
1✔
534
        if parts_list:
1✔
535
            initial_bbox = initial_bbox.union(parts_list.bbox)
1✔
536

537
        return Step(
1✔
538
            bbox=initial_bbox,
539
            step_number=step_num,
540
            parts_list=parts_list,
541
            diagram=None,
542
            rotation_symbol=None,
543
            arrows=[],
544
            subassemblies=[],
545
        )
546

547
    def _find_and_build_diagram_for_step(
1✔
548
        self,
549
        step_num: StepNumber,
550
        parts_list: PartsList | None,
551
        result: ClassificationResult,
552
    ) -> Diagram | None:
553
        """Find and build the best diagram for this step.
554

555
        This is called at build time, after rotation symbols and subassemblies
556
        have been built (in build_all Phases 1 and 3), so they've already
557
        claimed any images/diagrams they need. We filter to only consider
558
        diagram candidates that haven't been constructed yet.
559

560
        Args:
561
            step_num: The built step number element
562
            parts_list: The built parts list element (if any)
563
            result: Classification result containing diagram candidates
564

565
        Returns:
566
            The built Diagram element, or None if no suitable diagram found
567
        """
568
        # Get all non-failed, non-constructed diagram candidates
569
        diagram_candidates = result.get_scored_candidates(
×
570
            "diagram", valid_only=False, exclude_failed=True
571
        )
572

573
        # Filter to only candidates that haven't been built yet
574
        # (subassemblies and rotation symbols built earlier may have claimed some)
575
        available_candidates = [c for c in diagram_candidates if c.constructed is None]
×
576

577
        if not available_candidates:
×
578
            log.debug(
×
579
                "[step] No diagram candidates available for step %d",
580
                step_num.value,
581
            )
582
            return None
×
583

584
        # Score each candidate based on position relative to step
585
        step_bbox = step_num.bbox
×
586
        best_candidate = None
×
587
        best_score = -float("inf")
×
588

589
        for candidate in available_candidates:
×
590
            score = self._score_step_diagram_pair(step_bbox, candidate.bbox)
×
591

592
            log.debug(
×
593
                "[step] Diagram candidate at %s for step %d: score=%.2f",
594
                candidate.bbox,
595
                step_num.value,
596
                score,
597
            )
598

599
            if score > best_score:
×
600
                best_score = score
×
601
                best_candidate = candidate
×
602

603
        if best_candidate is None or best_score < 0.2:
×
604
            log.debug(
×
605
                "[step] No suitable diagram found for step %d (best_score=%.2f)",
606
                step_num.value,
607
                best_score,
608
            )
609
            return None
×
610

611
        # Build the diagram
612
        try:
×
613
            diagram_elem = result.build(best_candidate)
×
614
            assert isinstance(diagram_elem, Diagram)
×
615
            log.debug(
×
616
                "[step] Built diagram at %s for step %d (score=%.2f)",
617
                diagram_elem.bbox,
618
                step_num.value,
619
                best_score,
620
            )
621
            return diagram_elem
×
622
        except CandidateFailedError as e:
×
623
            log.debug(
×
624
                "[step] Failed to build diagram for step %d: %s",
625
                step_num.value,
626
                e,
627
            )
628
            return None
×
629

630
    def _create_step_candidate(
1✔
631
        self,
632
        step_candidate: Candidate,
633
        parts_list_candidate: Candidate | None,
634
        result: ClassificationResult,
635
    ) -> Candidate | None:
636
        """Create a Step candidate (without diagram assignment).
637

638
        Diagrams are found at build time, not during scoring, to allow
639
        rotation symbols to claim small images first.
640

641
        Args:
642
            step_candidate: The StepNumber candidate for this step
643
            parts_list_candidate: The PartsList candidate to pair with (or None)
644
            result: Classification result
645

646
        Returns:
647
            The created Candidate with score but no construction, or None if
648
            the step number value cannot be extracted.
649
        """
650
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
1✔
651
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
1✔
652
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance
1✔
653

654
        # Extract step number value from the candidate
655
        if not step_candidate.source_blocks:
1✔
656
            return None
×
657
        source_block = step_candidate.source_blocks[0]
1✔
658
        if not isinstance(source_block, Text):
1✔
659
            return None
×
660
        step_value = extract_step_number_value(source_block.text)
1✔
661
        if step_value is None:
1✔
662
            return None
×
663

664
        step_bbox = step_candidate.bbox
1✔
665
        parts_list_bbox = parts_list_candidate.bbox if parts_list_candidate else None
1✔
666

667
        # Calculate pairing scores if there's a parts_list above the step
668
        proximity_score = 0.0
1✔
669
        alignment_score = 0.0
1✔
670

671
        if (
1✔
672
            parts_list_bbox is not None
673
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
674
        ):
675
            # Calculate distance (how far apart vertically)
676
            distance = step_bbox.y0 - parts_list_bbox.y1
1✔
677

678
            # Calculate proximity score
679
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
1✔
680
            if max_distance > 0:
1✔
681
                proximity_score = max(0.0, 1.0 - (distance / max_distance))
1✔
682

683
            # Calculate alignment score (how well left edges align)
684
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
1✔
685
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
1✔
686
            if max_alignment_diff > 0:
1✔
687
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))
1✔
688

689
        # Create score object with candidate references
690
        # Diagrams are found at build time, not during scoring
691
        score = _StepScore(
1✔
692
            step_value=step_value,
693
            step_number_candidate=step_candidate,
694
            parts_list_candidate=parts_list_candidate,
695
            has_parts_list=parts_list_candidate is not None,
696
            step_proximity_score=proximity_score,
697
            step_alignment_score=alignment_score,
698
        )
699

700
        # Calculate combined bbox for the candidate (without diagram)
701
        combined_bbox = step_bbox
1✔
702
        if parts_list_bbox:
1✔
703
            combined_bbox = BBox.union(combined_bbox, parts_list_bbox)
1✔
704

705
        # Create candidate
706
        return Candidate(
1✔
707
            bbox=combined_bbox,
708
            label="step",
709
            score=score.overall_score(),
710
            score_details=score,
711
            source_blocks=[],
712
        )
713

714
    def _score_step_diagram_pair(
1✔
715
        self,
716
        step_bbox: BBox,
717
        diagram_bbox: BBox,
718
    ) -> float:
719
        """Score how well a diagram matches a step.
720

721
        Diagrams are typically positioned to the right of and/or below the step
722
        number. This method scores based on:
723
        - Horizontal position: prefer diagrams to the right, penalize left
724
        - Vertical position: prefer diagrams below the step header
725
        - Distance: closer is better
726

727
        Args:
728
            step_bbox: The step number bounding box
729
            diagram_bbox: The diagram bounding box to score
730

731
        Returns:
732
            Score between 0.0 and 1.0 (higher is better match)
733
        """
734
        # Reference point: bottom-right of step number
735
        ref_x = step_bbox.x1
×
736
        ref_y = step_bbox.y1
×
737

738
        # TODO Move all these constants into config, or make them adaptive?
739

740
        # Horizontal score
741
        # Diagrams to the right are preferred, but allow some overlap
742
        x_offset = diagram_bbox.x0 - ref_x
×
743

744
        if x_offset >= -50:
×
745
            # Diagram starts to the right or slightly overlapping - good
746
            # Score decreases slightly with distance to the right
747
            x_score = max(0.5, 1.0 - abs(x_offset) / 400.0)
×
748
        elif x_offset >= -200:
×
749
            # Diagram is moderately to the left - acceptable
750
            x_score = 0.3 + 0.2 * (1.0 + x_offset / 200.0)
×
751
        else:
752
            # Diagram is far to the left - poor match
753
            x_score = max(0.1, 0.3 + x_offset / 400.0)
×
754

755
        # Vertical score
756
        # Diagrams below the step header are preferred
757
        y_offset = diagram_bbox.y0 - ref_y
×
758

759
        if y_offset >= -30:
×
760
            # Diagram starts below or slightly overlapping - good
761
            # Score decreases with vertical distance
762
            y_score = max(0.3, 1.0 - abs(y_offset) / 300.0)
×
763
        elif y_offset >= -100:
×
764
            # Diagram is moderately above - less good but acceptable
765
            y_score = 0.2 + 0.3 * (1.0 + y_offset / 100.0)
×
766
        else:
767
            # Diagram is far above - poor match
768
            y_score = max(0.05, 0.2 + y_offset / 300.0)
×
769

770
        # Combined score - weight both dimensions equally
771
        score = 0.5 * x_score + 0.5 * y_score
×
772

773
        return score
×
774

775
    def _get_rotation_symbol_for_step(
1✔
776
        self,
777
        step_bbox: BBox,
778
        result: ClassificationResult,
779
    ) -> RotationSymbol | None:
780
        """Find an already-built rotation symbol within this step's area.
781

782
        Rotation symbols are built by PageClassifier before steps are processed.
783
        This method finds the already-built rotation symbol that falls within
784
        the step's bounding box.
785

786
        Args:
787
            step_bbox: The step's bounding box (including step_num, parts_list,
788
                and diagram)
789
            result: Classification result containing rotation symbol candidates
790

791
        Returns:
792
            Single RotationSymbol element for this step, or None if not found
793
        """
794
        # Get rotation_symbol candidates - they should already be built
795
        # by PageClassifier. Use valid_only=True to only get successfully
796
        # constructed rotation symbols.
797
        rotation_symbol_candidates = result.get_scored_candidates(
×
798
            "rotation_symbol", valid_only=True
799
        )
800

801
        log.debug(
×
802
            "[step] Looking for rotation symbols in step bbox %s, "
803
            "found %d built candidates",
804
            step_bbox,
805
            len(rotation_symbol_candidates),
806
        )
807

808
        if not rotation_symbol_candidates:
×
809
            return None
×
810

811
        # Find best-scoring rotation symbol overlapping the step's bbox
812
        overlapping_candidates = filter_overlapping(
×
813
            rotation_symbol_candidates, step_bbox
814
        )
815
        best_candidate = find_best_scoring(overlapping_candidates)
×
816

817
        if best_candidate and best_candidate.constructed is not None:
×
818
            rotation_symbol = best_candidate.constructed
×
819
            assert isinstance(rotation_symbol, RotationSymbol)
×
820
            log.debug(
×
821
                "[step] Found rotation symbol at %s (score=%.2f)",
822
                rotation_symbol.bbox,
823
                best_candidate.score,
824
            )
825
            return rotation_symbol
×
826

827
        log.debug("[step] No rotation symbol found in step bbox %s", step_bbox)
×
828
        return None
×
829

830
    def _get_arrows_for_step(
1✔
831
        self,
832
        step_num: StepNumber,
833
        diagram: Diagram | None,
834
        result: ClassificationResult,
835
    ) -> list[Arrow]:
836
        """Collect already-built arrows that belong to this step.
837

838
        Arrows are built in Phase 3 of build_all(), before subassemblies.
839
        This method finds arrows that are positioned near the step's diagram
840
        and assigns them to this step.
841

842
        Typically these are arrows pointing from subassembly callout boxes
843
        to the main diagram.
844

845
        Args:
846
            step_num: The step number element
847
            diagram: The diagram element (if any)
848
            result: Classification result containing arrow candidates
849

850
        Returns:
851
            List of Arrow elements for this step
852
        """
853
        # Get already-built arrows (built in Phase 3)
854
        arrow_candidates = result.get_scored_candidates(
1✔
855
            "arrow",
856
            valid_only=True,  # Only get successfully built arrows
857
        )
858

859
        log.debug(
1✔
860
            "[step] Looking for arrows for step %d, found %d built arrows",
861
            step_num.value,
862
            len(arrow_candidates),
863
        )
864

865
        if not arrow_candidates:
1✔
866
            return []
1✔
867

868
        # Determine search region: prefer diagram area, fallback to step area
869
        search_bbox = diagram.bbox if diagram else step_num.bbox
1✔
870

871
        # Expand search region to catch arrows near the diagram
872
        # Use a larger margin than rotation symbols since arrows can extend further
873
        search_region = search_bbox.expand(100.0)
1✔
874

875
        log.debug(
1✔
876
            "[step] Arrow search region for step %d: %s",
877
            step_num.value,
878
            search_region,
879
        )
880

881
        # Find arrows within or overlapping the search region
882
        arrows: list[Arrow] = []
1✔
883
        overlapping_candidates = filter_overlapping(arrow_candidates, search_region)
1✔
884

885
        for candidate in overlapping_candidates:
1✔
886
            assert candidate.constructed is not None
1✔
887
            assert isinstance(candidate.constructed, Arrow)
1✔
888
            log.debug(
1✔
889
                "[step]   Arrow at %s overlaps search region (score=%.2f)",
890
                candidate.bbox,
891
                candidate.score,
892
            )
893
            arrows.append(candidate.constructed)
1✔
894

895
        log.debug(
1✔
896
            "[step] Found %d arrows for step %d",
897
            len(arrows),
898
            step_num.value,
899
        )
900
        return arrows
1✔
901

902
    def _get_subassemblies_for_step(
1✔
903
        self,
904
        step_num: StepNumber,
905
        diagram: Diagram | None,
906
        result: ClassificationResult,
907
    ) -> list[SubAssembly]:
908
        """Get already-built subassemblies that belong to this step.
909

910
        Subassemblies are built in build_all() before steps. This method finds
911
        subassemblies that are positioned near this step's diagram and haven't
912
        been claimed by another step yet.
913

914
        Args:
915
            step_num: The step number element
916
            diagram: The diagram element (if any)
917
            result: Classification result containing subassembly candidates
918

919
        Returns:
920
            List of SubAssembly elements for this step
921
        """
922
        # Get all built subassemblies
923
        all_subassemblies: list[SubAssembly] = []
×
924
        for sa_candidate in result.get_scored_candidates(
×
925
            "subassembly", valid_only=True
926
        ):
927
            assert sa_candidate.constructed is not None
×
928
            assert isinstance(sa_candidate.constructed, SubAssembly)
×
929
            all_subassemblies.append(sa_candidate.constructed)
×
930

931
        log.debug(
×
932
            "[step] Looking for subassemblies for step %d, found %d built",
933
            step_num.value,
934
            len(all_subassemblies),
935
        )
936

937
        if not all_subassemblies:
×
938
            return []
×
939

940
        # Determine search region based on step_number and diagram
941
        reference_bbox = diagram.bbox.union(step_num.bbox) if diagram else step_num.bbox
×
942

943
        page_bbox = result.page_data.bbox
×
944

945
        # Expand search region around the reference area
946
        # Horizontally: search the full page width
947
        # Vertically: search a region around the reference bbox
948
        vertical_margin = reference_bbox.height
×
949
        search_region = BBox(
×
950
            x0=page_bbox.x0,
951
            y0=max(page_bbox.y0, reference_bbox.y0 - vertical_margin),
952
            x1=page_bbox.x1,
953
            y1=min(page_bbox.y1, reference_bbox.y1 + vertical_margin),
954
        )
955

956
        log.debug(
×
957
            "[step] SubAssembly search region for step %d: %s",
958
            step_num.value,
959
            search_region,
960
        )
961

962
        # Find subassemblies that overlap the search region
963
        subassemblies: list[SubAssembly] = []
×
964
        for subassembly in all_subassemblies:
×
965
            if subassembly.bbox.overlaps(search_region):
×
966
                log.debug(
×
967
                    "[step]   SubAssembly at %s overlaps search region",
968
                    subassembly.bbox,
969
                )
970
                subassemblies.append(subassembly)
×
971

972
        log.debug(
×
973
            "[step] Found %d subassemblies for step %d",
974
            len(subassemblies),
975
            step_num.value,
976
        )
977
        return subassemblies
×
978

979
    def _compute_step_bbox(
1✔
980
        self,
981
        step_num: StepNumber,
982
        parts_list: PartsList | None,
983
        diagram: Diagram | None,
984
        subassemblies: list[SubAssembly],
985
        page_bbox: BBox,
986
    ) -> BBox:
987
        """Compute the overall bounding box for the Step.
988

989
        This encompasses the step number, parts list (if any), diagram (if any),
990
        and all subassemblies.
991
        The result is clipped to the page bounds to handle elements that extend
992
        slightly off-page (e.g., arrows in diagrams).
993

994
        Args:
995
            step_num: The step number element
996
            parts_list: The parts list (if any)
997
            diagram: The diagram element (if any)
998
            subassemblies: List of subassemblies for this step
999
            page_bbox: The page bounding box to clip to
1000

1001
        Returns:
1002
            Combined bounding box, clipped to page bounds
1003
        """
1004
        bboxes = [step_num.bbox]
1✔
1005
        if parts_list:
1✔
1006
            bboxes.append(parts_list.bbox)
1✔
1007
        if diagram:
1✔
1008
            bboxes.append(diagram.bbox)
1✔
1009
        for sa in subassemblies:
1✔
1010
            bboxes.append(sa.bbox)
1✔
1011

1012
        return BBox.union_all(bboxes).clip_to(page_bbox)
1✔
1013

1014

1015
def assign_diagrams_to_steps(
1✔
1016
    steps: list[Step],
1017
    diagrams: list[Diagram],
1018
    max_distance: float = 500.0,
1019
) -> None:
1020
    """Assign diagrams to steps using Hungarian algorithm.
1021

1022
    Uses optimal bipartite matching to pair diagrams with steps based on
1023
    a scoring function that considers:
1024
    - Distance from step_number to diagram (closer is better)
1025
    - Relative position (diagram typically below/right of step_number)
1026

1027
    This function mutates the Step objects in place, setting their diagram
1028
    attribute.
1029

1030
    Args:
1031
        steps: List of Step objects to assign diagrams to
1032
        diagrams: List of Diagram objects to assign
1033
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1034
            greater than this will not be matched.
1035
    """
1036
    if not steps or not diagrams:
1✔
1037
        log.debug(
1✔
1038
            "[step] No diagram assignment needed: steps=%d, diagrams=%d",
1039
            len(steps),
1040
            len(diagrams),
1041
        )
1042
        return
1✔
1043

1044
    n_steps = len(steps)
1✔
1045
    n_diagrams = len(diagrams)
1✔
1046

1047
    log.debug(
1✔
1048
        "[step] Running Hungarian matching for diagrams: %d steps, %d diagrams",
1049
        n_steps,
1050
        n_diagrams,
1051
    )
1052

1053
    # Build cost matrix: rows = diagrams, cols = steps
1054
    # Lower cost = better match
1055
    cost_matrix = np.zeros((n_diagrams, n_steps))
1✔
1056

1057
    for i, diag in enumerate(diagrams):
1✔
1058
        diag_center = diag.bbox.center
1✔
1059
        for j, step in enumerate(steps):
1✔
1060
            step_num_center = step.step_number.bbox.center
1✔
1061

1062
            # Base cost is distance from step_number to diagram center
1063
            distance = (
1✔
1064
                (step_num_center[0] - diag_center[0]) ** 2
1065
                + (step_num_center[1] - diag_center[1]) ** 2
1066
            ) ** 0.5
1067

1068
            # Apply position penalty: prefer diagrams that are below or to the
1069
            # right of the step_number (typical LEGO instruction layout)
1070
            # If diagram is above the step_number, add penalty
1071
            if diag_center[1] < step_num_center[1] - 50:  # Diagram above step
1✔
1072
                distance *= 1.5  # Penalty for being above
1✔
1073

1074
            cost_matrix[i, j] = distance
1✔
1075
            log.debug(
1✔
1076
                "[step]   Cost diagram[%d] at %s to step[%d]: %.1f",
1077
                i,
1078
                diag.bbox,
1079
                step.step_number.value,
1080
                distance,
1081
            )
1082

1083
    # Apply max_distance threshold
1084
    high_cost = max_distance * 10
1✔
1085
    cost_matrix_thresholded = np.where(
1✔
1086
        cost_matrix > max_distance, high_cost, cost_matrix
1087
    )
1088

1089
    # Run Hungarian algorithm
1090
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1091

1092
    # Assign diagrams to steps based on the matching
1093
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1094
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1095
            diag = diagrams[row_idx]
1✔
1096
            step = steps[col_idx]
1✔
1097
            # Use object.__setattr__ because Step is a frozen Pydantic model
1098
            object.__setattr__(step, "diagram", diag)
1✔
1099
            log.debug(
1✔
1100
                "[step] Assigned diagram at %s to step %d (cost=%.1f)",
1101
                diag.bbox,
1102
                step.step_number.value,
1103
                cost_matrix[row_idx, col_idx],
1104
            )
1105
        else:
1106
            log.debug(
×
1107
                "[step] Skipped diagram assignment: diagram[%d] to step[%d] "
1108
                "cost=%.1f > max_distance=%.1f",
1109
                row_idx,
1110
                col_idx,
1111
                cost_matrix[row_idx, col_idx],
1112
                max_distance,
1113
            )
1114

1115

1116
def _has_divider_between(
1✔
1117
    bbox1: BBox,
1118
    bbox2: BBox,
1119
    dividers: list[Divider],
1120
) -> bool:
1121
    """Check if there is a divider line between two bboxes.
1122

1123
    A divider is considered "between" if it crosses the line connecting
1124
    the centers of the two bboxes. Dividers that are contained within
1125
    either bbox are ignored (they are internal dividers, not separating).
1126

1127
    Args:
1128
        bbox1: First bounding box
1129
        bbox2: Second bounding box
1130
        dividers: List of dividers to check
1131

1132
    Returns:
1133
        True if a divider is between the two bboxes
1134
    """
1135
    center1 = bbox1.center
1✔
1136
    center2 = bbox2.center
1✔
1137

1138
    for divider in dividers:
1✔
UNCOV
1139
        div_bbox = divider.bbox
×
1140

1141
        # Skip dividers that are fully contained within either bbox
1142
        # (these are internal dividers within a container, not separating)
UNCOV
1143
        if bbox1.contains(div_bbox) or bbox2.contains(div_bbox):
×
UNCOV
1144
            continue
×
1145

1146
        # Check if divider is vertical (separates left/right)
UNCOV
1147
        if div_bbox.width < div_bbox.height * 0.2:  # Vertical line
×
1148
            # Check if divider x is between the two centers
UNCOV
1149
            min_x = min(center1[0], center2[0])
×
UNCOV
1150
            max_x = max(center1[0], center2[0])
×
UNCOV
1151
            div_x = div_bbox.center[0]
×
UNCOV
1152
            if min_x < div_x < max_x:
×
1153
                # Check if divider spans the y range
1154
                min_y = min(center1[1], center2[1])
×
UNCOV
1155
                max_y = max(center1[1], center2[1])
×
1156
                if div_bbox.y0 <= max_y and div_bbox.y1 >= min_y:
×
1157
                    return True
×
1158

1159
        # Check if divider is horizontal (separates top/bottom)
UNCOV
1160
        elif div_bbox.height < div_bbox.width * 0.2:  # Horizontal line
×
1161
            # Check if divider y is between the two centers
1162
            min_y = min(center1[1], center2[1])
×
1163
            max_y = max(center1[1], center2[1])
×
1164
            div_y = div_bbox.center[1]
×
UNCOV
1165
            if min_y < div_y < max_y:
×
1166
                # Check if divider spans the x range
UNCOV
1167
                min_x = min(center1[0], center2[0])
×
UNCOV
1168
                max_x = max(center1[0], center2[0])
×
UNCOV
1169
                if div_bbox.x0 <= max_x and div_bbox.x1 >= min_x:
×
UNCOV
1170
                    return True
×
1171

1172
    return False
1✔
1173

1174

1175
def assign_subassemblies_to_steps(
1✔
1176
    steps: list[Step],
1177
    subassemblies: list[SubAssembly],
1178
    dividers: list[Divider],
1179
    max_distance: float = 400.0,
1180
) -> None:
1181
    """Assign subassemblies to steps using Hungarian algorithm.
1182

1183
    Uses optimal bipartite matching to pair subassemblies with steps based on:
1184
    - Distance from step's diagram to subassembly (closer is better)
1185
    - No divider between the subassembly and the step's diagram
1186

1187
    This function mutates the Step objects in place, adding to their
1188
    subassemblies list.
1189

1190
    Args:
1191
        steps: List of Step objects to assign subassemblies to
1192
        subassemblies: List of SubAssembly objects to assign
1193
        dividers: List of Divider objects for obstruction checking
1194
        max_distance: Maximum distance for a valid assignment
1195
    """
1196
    if not steps or not subassemblies:
1✔
1197
        log.debug(
1✔
1198
            "[step] No subassembly assignment needed: steps=%d, subassemblies=%d",
1199
            len(steps),
1200
            len(subassemblies),
1201
        )
1202
        return
1✔
1203

1204
    n_steps = len(steps)
1✔
1205
    n_subassemblies = len(subassemblies)
1✔
1206

1207
    log.debug(
1✔
1208
        "[step] Running Hungarian matching for subassemblies: "
1209
        "%d steps, %d subassemblies",
1210
        n_steps,
1211
        n_subassemblies,
1212
    )
1213

1214
    # Build cost matrix: rows = subassemblies, cols = steps
1215
    cost_matrix = np.zeros((n_subassemblies, n_steps))
1✔
1216
    high_cost = max_distance * 10
1✔
1217

1218
    for i, sa in enumerate(subassemblies):
1✔
1219
        sa_center = sa.bbox.center
1✔
1220
        for j, step in enumerate(steps):
1✔
1221
            # Use diagram center if available, otherwise step bbox center
1222
            if step.diagram:
1✔
1223
                target_bbox = step.diagram.bbox
1✔
1224
                target_center = target_bbox.center
1✔
1225
            else:
UNCOV
1226
                target_bbox = step.bbox
×
UNCOV
1227
                target_center = step.step_number.bbox.center
×
1228

1229
            # Base cost is distance from diagram/step to subassembly center
1230
            distance = (
1✔
1231
                (target_center[0] - sa_center[0]) ** 2
1232
                + (target_center[1] - sa_center[1]) ** 2
1233
            ) ** 0.5
1234

1235
            # Check for divider between subassembly and step's diagram
1236
            if _has_divider_between(sa.bbox, target_bbox, dividers):
1✔
1237
                # High cost if there's a divider between
UNCOV
1238
                distance = high_cost
×
UNCOV
1239
                log.debug(
×
1240
                    "[step]   Divider between subassembly[%d] at %s and step[%d]",
1241
                    i,
1242
                    sa.bbox,
1243
                    step.step_number.value,
1244
                )
1245

1246
            cost_matrix[i, j] = distance
1✔
1247
            log.debug(
1✔
1248
                "[step]   Cost subassembly[%d] at %s to step[%d]: %.1f",
1249
                i,
1250
                sa.bbox,
1251
                step.step_number.value,
1252
                distance,
1253
            )
1254

1255
    # Apply max_distance threshold
1256
    cost_matrix_thresholded = np.where(
1✔
1257
        cost_matrix > max_distance, high_cost, cost_matrix
1258
    )
1259

1260
    # Run Hungarian algorithm
1261
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
1✔
1262

1263
    # Assign subassemblies to steps based on the matching
1264
    # Note: Unlike diagrams, multiple subassemblies can be assigned to one step
1265
    # The Hungarian algorithm gives us the best 1:1 matching, but we may want
1266
    # to assign additional nearby subassemblies too
1267

1268
    # First, do the 1:1 optimal assignment
1269
    assigned_subassemblies: set[int] = set()
1✔
1270
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
1✔
1271
        if cost_matrix[row_idx, col_idx] <= max_distance:
1✔
1272
            sa = subassemblies[row_idx]
1✔
1273
            step = steps[col_idx]
1✔
1274
            # Add to step's subassemblies list
1275
            new_subassemblies = list(step.subassemblies) + [sa]
1✔
1276
            object.__setattr__(step, "subassemblies", new_subassemblies)
1✔
1277
            assigned_subassemblies.add(row_idx)
1✔
1278
            log.debug(
1✔
1279
                "[step] Assigned subassembly at %s to step %d (cost=%.1f)",
1280
                sa.bbox,
1281
                step.step_number.value,
1282
                cost_matrix[row_idx, col_idx],
1283
            )
1284

1285
    # Then, try to assign remaining unassigned subassemblies to their nearest step
1286
    # (if within max_distance and no divider between)
1287
    for i, sa in enumerate(subassemblies):
1✔
1288
        if i in assigned_subassemblies:
1✔
1289
            continue
1✔
1290

1291
        # Find the step with lowest cost for this subassembly
UNCOV
1292
        best_step_idx = None
×
UNCOV
1293
        best_cost = high_cost
×
UNCOV
1294
        for j in range(len(steps)):
×
UNCOV
1295
            if cost_matrix[i, j] < best_cost:
×
UNCOV
1296
                best_cost = cost_matrix[i, j]
×
UNCOV
1297
                best_step_idx = j
×
1298

UNCOV
1299
        if best_step_idx is not None and best_cost <= max_distance:
×
UNCOV
1300
            step = steps[best_step_idx]
×
UNCOV
1301
            new_subassemblies = list(step.subassemblies) + [sa]
×
UNCOV
1302
            object.__setattr__(step, "subassemblies", new_subassemblies)
×
UNCOV
1303
            log.debug(
×
1304
                "[step] Assigned extra subassembly at %s to step %d (cost=%.1f)",
1305
                sa.bbox,
1306
                step.step_number.value,
1307
                best_cost,
1308
            )
1309

1310

1311
def assign_rotation_symbols_to_steps(
1✔
1312
    steps: list[Step],
1313
    rotation_symbols: list[RotationSymbol],
1314
    max_distance: float = 300.0,
1315
) -> None:
1316
    """Assign rotation symbols to steps using Hungarian algorithm.
1317

1318
    Uses optimal bipartite matching to pair rotation symbols with steps based on
1319
    minimum distance from the rotation symbol to the step's diagram (or step bbox
1320
    center if no diagram).
1321

1322
    This function mutates the Step objects in place, setting their rotation_symbol
1323
    attribute.
1324

1325
    Args:
1326
        steps: List of Step objects to assign rotation symbols to
1327
        rotation_symbols: List of RotationSymbol objects to assign
1328
        max_distance: Maximum distance for a valid assignment. Pairs with distance
1329
            greater than this will not be matched.
1330
    """
1331
    if not steps or not rotation_symbols:
1✔
1332
        log.debug(
1✔
1333
            "[step] No rotation symbol assignment needed: "
1334
            "steps=%d, rotation_symbols=%d",
1335
            len(steps),
1336
            len(rotation_symbols),
1337
        )
1338
        return
1✔
1339

UNCOV
1340
    n_steps = len(steps)
×
UNCOV
1341
    n_symbols = len(rotation_symbols)
×
1342

UNCOV
1343
    log.debug(
×
1344
        "[step] Running Hungarian matching: %d steps, %d rotation symbols",
1345
        n_steps,
1346
        n_symbols,
1347
    )
1348

1349
    # Build cost matrix: rows = rotation symbols, cols = steps
1350
    # Cost = distance from rotation symbol center to step's diagram center
1351
    # (or step bbox center if no diagram)
UNCOV
1352
    cost_matrix = np.zeros((n_symbols, n_steps))
×
1353

UNCOV
1354
    for i, rs in enumerate(rotation_symbols):
×
1355
        rs_center = rs.bbox.center
×
UNCOV
1356
        for j, step in enumerate(steps):
×
1357
            # Use diagram center if available, otherwise step bbox center
UNCOV
1358
            if step.diagram:
×
UNCOV
1359
                target_center = step.diagram.bbox.center
×
1360
            else:
UNCOV
1361
                target_center = step.bbox.center
×
1362

1363
            # Euclidean distance
UNCOV
1364
            distance = (
×
1365
                (rs_center[0] - target_center[0]) ** 2
1366
                + (rs_center[1] - target_center[1]) ** 2
1367
            ) ** 0.5
1368

UNCOV
1369
            cost_matrix[i, j] = distance
×
UNCOV
1370
            log.debug(
×
1371
                "[step]   Distance from rotation_symbol[%d] at %s to step[%d]: %.1f",
1372
                i,
1373
                rs.bbox,
1374
                step.step_number.value,
1375
                distance,
1376
            )
1377

1378
    # Apply max_distance threshold by setting costs above threshold to a high value
1379
    # This prevents assignments that are too far apart
UNCOV
1380
    high_cost = max_distance * 10  # Arbitrary large value
×
UNCOV
1381
    cost_matrix_thresholded = np.where(
×
1382
        cost_matrix > max_distance, high_cost, cost_matrix
1383
    )
1384

1385
    # Run Hungarian algorithm
UNCOV
1386
    row_indices, col_indices = linear_sum_assignment(cost_matrix_thresholded)
×
1387

1388
    # Assign rotation symbols to steps based on the matching
UNCOV
1389
    for row_idx, col_idx in zip(row_indices, col_indices, strict=True):
×
1390
        # Check if this assignment is within the max_distance threshold
UNCOV
1391
        if cost_matrix[row_idx, col_idx] <= max_distance:
×
UNCOV
1392
            rs = rotation_symbols[row_idx]
×
UNCOV
1393
            step = steps[col_idx]
×
UNCOV
1394
            step.rotation_symbol = rs
×
1395
            # Expand step bbox to include the rotation symbol
UNCOV
1396
            step.bbox = step.bbox.union(rs.bbox)
×
UNCOV
1397
            log.debug(
×
1398
                "[step] Assigned rotation symbol at %s to step %d "
1399
                "(distance=%.1f, new step bbox=%s)",
1400
                rs.bbox,
1401
                step.step_number.value,
1402
                cost_matrix[row_idx, col_idx],
1403
                step.bbox,
1404
            )
1405
        else:
UNCOV
1406
            log.debug(
×
1407
                "[step] Skipped assignment: rotation_symbol[%d] to step[%d] "
1408
                "distance=%.1f > max_distance=%.1f",
1409
                row_idx,
1410
                col_indices[row_idx] if row_idx < len(col_indices) else -1,
1411
                cost_matrix[row_idx, col_idx],
1412
                max_distance,
1413
            )
1414

1415

1416
def filter_subassembly_values[T](
1✔
1417
    items: list[tuple[int, T]],
1418
    *,
1419
    min_gap: int = 3,
1420
    max_subassembly_start: int = 3,
1421
) -> list[tuple[int, T]]:
1422
    """Filter items to exclude likely subassembly values.
1423

1424
    Subassembly steps (e.g., step 1, 2 inside a subassembly box) often appear
1425
    alongside higher-numbered page-level steps (e.g., 15, 16). This creates
1426
    sequences like [1, 2, 15, 16] which cannot all be page-level steps.
1427

1428
    This function identifies such cases by detecting a significant gap in values
1429
    and filtering out the lower-numbered items that are likely subassembly steps.
1430

1431
    Args:
1432
        items: List of (value, item) tuples where value is the step number
1433
            and item is any associated data (e.g., a Candidate).
1434
        min_gap: Minimum gap size to trigger filtering (exclusive).
1435
            Default is 3, so gaps > 3 trigger filtering.
1436
        max_subassembly_start: Maximum starting value for subassembly detection.
1437
            If min value is <= this, filtering is applied. Default is 3.
1438

1439
    Returns:
1440
        Filtered list of (value, item) tuples, keeping only the higher values
1441
        if a subassembly pattern is detected. Returns original list if no
1442
        filtering is needed.
1443

1444
    Examples:
1445
        >>> filter_subassembly_values([(1, "a"), (2, "b"), (15, "c"), (16, "d")])
1446
        [(15, "c"), (16, "d")]
1447

1448
        >>> filter_subassembly_values([(5, "a"), (6, "b"), (15, "c")])
1449
        [(5, "a"), (6, "b"), (15, "c")]  # min=5 > 3, no filtering
1450
    """
1451
    if len(items) <= 1:
1✔
1452
        return items
1✔
1453

1454
    # Sort by value
1455
    sorted_items = sorted(items, key=lambda x: x[0])
1✔
1456
    values = [v for v, _ in sorted_items]
1✔
1457

1458
    # Find the largest gap between consecutive values
1459
    max_gap_size = 0
1✔
1460
    max_gap_index = -1
1✔
1461
    for i in range(len(values) - 1):
1✔
1462
        gap = values[i + 1] - values[i]
1✔
1463
        if gap > max_gap_size:
1✔
1464
            max_gap_size = gap
1✔
1465
            max_gap_index = i
1✔
1466

1467
    # Check if filtering should occur:
1468
    # 1. Gap must be larger than min_gap
1469
    # 2. The minimum value must be <= max_subassembly_start
1470
    if max_gap_size > min_gap and max_gap_index >= 0:
1✔
1471
        min_value = values[0]
1✔
1472
        if min_value <= max_subassembly_start:
1✔
1473
            threshold = values[max_gap_index + 1]
1✔
1474
            return [(v, item) for v, item in sorted_items if v >= threshold]
1✔
1475

1476
    return items
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc