• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19995046189

06 Dec 2025 10:18PM UTC coverage: 90.506% (+0.09%) from 90.421%
19995046189

push

github

bramp
test: regenerate golden files for step classifier refactoring

10525 of 11629 relevant lines covered (90.51%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.39
/src/build_a_long/pdf_extract/classifier/steps/subassembly_classifier.py
1
"""
2
SubAssembly classifier.
3

4
Purpose
5
-------
6
Identify sub-assembly callout boxes on LEGO instruction pages. SubAssemblies
7
typically:
8
- Are white/light-colored rectangular boxes
9
- Contain a count label (e.g., "2x") indicating how many to build
10
- Contain a small diagram/image of the sub-assembly
11
- Have an arrow pointing from them to the main diagram
12

13
Heuristic
14
---------
15
1. Find large, white/light-filled Drawing blocks (potential subassembly containers)
16
2. Look for step_count candidates inside the boxes
17
3. Look for diagram candidates inside the boxes
18
4. Optionally find arrows near the boxes
19

20
Debugging
21
---------
22
Set environment variables to aid investigation without code changes:
23

24
- LOG_LEVEL=DEBUG
25
    Enables DEBUG-level logging (if not already configured by caller).
26
"""
27

28
from __future__ import annotations
1✔
29

30
import logging
1✔
31
from typing import ClassVar
1✔
32

33
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
34
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
35
    ClassificationResult,
36
)
37
from build_a_long.pdf_extract.classifier.config import SubAssemblyConfig
1✔
38
from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
1✔
39
from build_a_long.pdf_extract.classifier.score import Score, Weight
1✔
40
from build_a_long.pdf_extract.classifier.text import extract_step_number_value
1✔
41
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
42
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
43
    Diagram,
44
    StepCount,
45
    StepNumber,
46
    SubAssembly,
47
    SubAssemblyStep,
48
)
49
from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing, Image, Text
1✔
50

51
log = logging.getLogger(__name__)
1✔
52

53

54
class _SubAssemblyScore(Score):
    """Score breakdown recorded for one candidate subassembly box.

    Besides the three numeric components that feed the final score, it keeps
    the candidates and image blocks found in or near the box so they are
    available when the element is built.
    """

    box_score: float
    """Score for the box fill being white or light (0.0-1.0)."""

    count_score: float
    """Score for having a step_count candidate inside the box (0.0-1.0)."""

    diagram_score: float
    """Score for having a diagram candidate or image inside the box (0.0-1.0)."""

    step_count_candidate: Candidate | None
    """The step_count candidate found inside the box, if any."""

    diagram_candidate: Candidate | None
    """The primary diagram candidate found inside the box, if any."""

    step_number_candidates: list[Candidate]
    """Step number candidates found inside the box (multi-step subassemblies)."""

    diagram_candidates: list[Candidate]
    """All diagram candidates inside the box (multi-step subassemblies)."""

    images_inside: list[Image]
    """Image blocks found directly inside the box (not from diagram clustering)."""

    arrow_candidate: Candidate | None
    """Arrow candidate found in or near this subassembly box, if any."""

    config: SubAssemblyConfig
    """Configuration supplying the weights used by score()."""

    def score(self) -> Weight:
        """Return the weighted sum of the box, count and diagram scores."""
        weights = self.config
        box_part = self.box_score * weights.box_shape_weight
        count_part = self.count_score * weights.count_weight
        diagram_part = self.diagram_score * weights.diagram_weight
        return box_part + count_part + diagram_part
94

95

96
class SubAssemblyClassifier(LabelClassifier):
1✔
97
    """Classifier for subassembly callout boxes."""
98

99
    output: ClassVar[str] = "subassembly"
1✔
100
    requires: ClassVar[frozenset[str]] = frozenset(
1✔
101
        {"arrow", "step_count", "step_number", "diagram"}
102
    )
103

104
    def _score(self, result: ClassificationResult) -> None:
        """Register a candidate for every Drawing that looks like a callout box.

        A Drawing is considered when it meets the configured minimum size, is
        not a near-duplicate of an already accepted box, and has a light fill.
        Its score combines the fill score with whether a step_count candidate
        and a diagram candidate (or a raw Image block) were found inside it.
        Boxes whose score reaches ``min_score`` are added to ``result``.
        """
        page_data = result.page_data
        cfg = self.config.subassembly

        def scored(label: str) -> list[Candidate]:
            # Every dependency is queried with the same flags.
            return result.get_scored_candidates(
                label, valid_only=False, exclude_failed=True
            )

        count_pool = scored("step_count")
        number_pool = scored("step_number")
        diagram_pool = scored("diagram")
        arrow_pool = scored("arrow")

        # Several Drawing blocks can share nearly the same bbox (e.g. a
        # white-filled box and a black-bordered box for one subassembly), so
        # accepted boxes are remembered and later look-alikes are skipped.
        accepted: list[BBox] = []

        for drawing in page_data.blocks:
            if not isinstance(drawing, Drawing):
                continue

            box = drawing.bbox

            # Subassemblies must be larger than individual parts.
            too_narrow = box.width < cfg.min_subassembly_width
            if too_narrow or box.height < cfg.min_subassembly_height:
                continue

            if any(box.similar(seen, tolerance=2.0) for seen in accepted):
                log.debug(
                    "[subassembly] Skipping duplicate bbox at %s",
                    box,
                )
                continue

            # Only the fill color is scored here; dark or unfilled boxes drop out.
            fill_score = self._score_box_colors(drawing)
            if fill_score < 0.3:
                continue

            # Gather everything of interest located in or around the box.
            count_candidate = self._find_candidate_inside(box, count_pool)
            steps_inside = self._find_all_candidates_inside(box, number_pool)
            diagrams_inside = self._find_all_diagrams_inside(box, diagram_pool)
            # Raw images are looked up too, because an image inside the box may
            # have been absorbed into a larger diagram cluster.
            images_inside = self._find_images_inside(box, page_data.blocks)
            arrow_candidate = self._find_arrow_for_subassembly(box, arrow_pool)

            # The first entry is the diagram with the largest overlap.
            primary_diagram = diagrams_inside[0] if diagrams_inside else None
            has_visual = bool(primary_diagram or images_inside)

            # Only the box is mandatory; count and diagram just add score.
            details = _SubAssemblyScore(
                box_score=fill_score,
                count_score=1.0 if count_candidate else 0.0,
                diagram_score=1.0 if has_visual else 0.0,
                step_count_candidate=count_candidate,
                diagram_candidate=primary_diagram,
                step_number_candidates=steps_inside,
                diagram_candidates=diagrams_inside,
                images_inside=images_inside,
                arrow_candidate=arrow_candidate,
                config=cfg,
            )
            total = details.score()

            if total < cfg.min_score:
                log.debug(
                    "[subassembly] Rejected box at %s: score=%.2f < min_score=%.2f",
                    box,
                    total,
                    cfg.min_score,
                )
                continue

            accepted.append(box)
            result.add_candidate(
                Candidate(
                    bbox=box,
                    label="subassembly",
                    score=total,
                    score_details=details,
                    source_blocks=[drawing],
                )
            )
            log.debug(
                "[subassembly] Candidate at %s: has_count=%s, "
                "has_steps=%d, has_diagrams=%d, has_images=%d, score=%.2f",
                box,
                count_candidate is not None,
                len(steps_inside),
                len(diagrams_inside),
                len(images_inside),
                total,
            )
228

229
    def _score_box_colors(self, block: Drawing) -> float:
1✔
230
        """Score a drawing block based on having white fill.
231

232
        SubAssembly boxes typically have a white or light fill color.
233
        The outer black border boxes can be matched separately later.
234

235
        Args:
236
            block: The Drawing block to analyze
237

238
        Returns:
239
            Score from 0.0 to 1.0 where 1.0 is white fill
240
        """
241
        # Check fill color (white or light = good)
242
        if block.fill_color is not None:
1✔
243
            r, g, b = block.fill_color
1✔
244
            # Check if it's white or very light (all channels > 0.9)
245
            if r > 0.9 and g > 0.9 and b > 0.9:
1✔
246
                return 1.0
1✔
247
            # Light gray is also acceptable
248
            if r > 0.7 and g > 0.7 and b > 0.7:
1✔
249
                return 0.6
1✔
250

251
        return 0.0
1✔
252

253
    def _find_candidate_inside(
1✔
254
        self, bbox: BBox, candidates: list[Candidate]
255
    ) -> Candidate | None:
256
        """Find the best candidate that is fully inside the given box.
257

258
        Args:
259
            bbox: The bounding box of the subassembly container
260
            candidates: Candidates to search
261

262
        Returns:
263
            The best candidate inside the box, or None
264
        """
265
        best_candidate = None
1✔
266
        best_score = 0.0
1✔
267

268
        for candidate in candidates:
1✔
269
            if bbox.contains(candidate.bbox) and candidate.score > best_score:
1✔
270
                best_candidate = candidate
1✔
271
                best_score = candidate.score
1✔
272

273
        return best_candidate
1✔
274

275
    def _find_all_candidates_inside(
1✔
276
        self, bbox: BBox, candidates: list[Candidate]
277
    ) -> list[Candidate]:
278
        """Find all candidates that are fully inside the given box.
279

280
        Args:
281
            bbox: The bounding box of the subassembly container
282
            candidates: Candidates to search
283

284
        Returns:
285
            List of candidates inside the box, sorted by score (highest first)
286
        """
287
        inside: list[Candidate] = []
1✔
288

289
        for candidate in candidates:
1✔
290
            if bbox.contains(candidate.bbox):
1✔
291
                inside.append(candidate)
1✔
292

293
        # Sort by score (highest first)
294
        inside.sort(key=lambda c: c.score, reverse=True)
1✔
295
        return inside
1✔
296

297
    def _find_diagram_inside(
1✔
298
        self, bbox: BBox, diagram_candidates: list[Candidate]
299
    ) -> Candidate | None:
300
        """Find the best diagram candidate that overlaps with the box.
301

302
        Args:
303
            bbox: The bounding box of the subassembly container
304
            diagram_candidates: Diagram candidates to search
305

306
        Returns:
307
            The best diagram candidate overlapping the box, or None
308
        """
309
        best_candidate = None
×
310
        best_overlap = 0.0
×
311

312
        for candidate in diagram_candidates:
×
313
            if bbox.overlaps(candidate.bbox):
×
314
                # Calculate overlap area
315
                # TODO Should this use bbox.intersection_area(candidate.bbox)?
316
                overlap = bbox.intersect(candidate.bbox)
×
317
                overlap_area = overlap.width * overlap.height
×
318
                if overlap_area > best_overlap:
×
319
                    best_candidate = candidate
×
320
                    best_overlap = overlap_area
×
321

322
        return best_candidate
×
323

324
    def _find_all_diagrams_inside(
1✔
325
        self, bbox: BBox, diagram_candidates: list[Candidate]
326
    ) -> list[Candidate]:
327
        """Find all diagram candidates that overlap significantly with the box.
328

329
        Args:
330
            bbox: The bounding box of the subassembly container
331
            diagram_candidates: Diagram candidates to search
332

333
        Returns:
334
            List of diagram candidates overlapping the box, sorted by overlap area
335
        """
336
        diagrams: list[tuple[float, Candidate]] = []
1✔
337

338
        for candidate in diagram_candidates:
1✔
339
            if bbox.overlaps(candidate.bbox):
1✔
340
                # Calculate overlap area
341
                overlap = bbox.intersect(candidate.bbox)
1✔
342
                overlap_area = overlap.width * overlap.height
1✔
343
                # Only include if significant overlap (at least 50% inside the box)
344
                candidate_area = candidate.bbox.area
1✔
345
                if candidate_area > 0 and overlap_area / candidate_area >= 0.5:
1✔
346
                    diagrams.append((overlap_area, candidate))
1✔
347

348
        # Sort by overlap area (largest first)
349
        diagrams.sort(key=lambda x: x[0], reverse=True)
1✔
350
        return [c for _, c in diagrams]
1✔
351

352
    def _find_images_inside(self, bbox: BBox, blocks: list[Blocks]) -> list[Image]:
        """Collect the Image blocks lying entirely within the given box.

        Works on the raw page blocks rather than on diagram candidates, since
        an image inside a subassembly box may have been clustered into a
        larger diagram that extends outside the box.

        Args:
            bbox: The bounding box of the subassembly container
            blocks: All blocks on the page

        Returns:
            Image blocks fully inside the box with an area of at least 100,
            sorted by area (largest first)
        """
        # Anything smaller is treated as a decorative element.
        min_area = 100.0

        contained = [
            blk
            for blk in blocks
            if isinstance(blk, Image)
            and not blk.bbox.area < min_area
            and bbox.contains(blk.bbox)
        ]

        # Larger images are more likely to be the actual diagram.
        return sorted(contained, key=lambda img: img.bbox.area, reverse=True)
380

381
    def _find_arrow_for_subassembly(
        self, bbox: BBox, arrow_candidates: list[Candidate]
    ) -> Candidate | None:
        """Pick the best-scoring arrow in or right next to the subassembly box.

        The box is grown by a fixed margin on every side; every arrow
        candidate whose bbox overlaps the grown box competes, which covers
        arrows inside the box as well as arrows adjacent to it.

        Args:
            bbox: The bounding box of the subassembly container
            arrow_candidates: All arrow candidates on the page

        Returns:
            The overlapping arrow candidate with the highest score above 0.0
            (the first one on ties), or None
        """
        margin = 20.0  # Points of margin around the box
        search_area = BBox(
            x0=bbox.x0 - margin,
            y0=bbox.y0 - margin,
            x1=bbox.x1 + margin,
            y1=bbox.y1 + margin,
        )

        nearby = [
            arrow
            for arrow in arrow_candidates
            if search_area.overlaps(arrow.bbox) and arrow.score > 0.0
        ]
        # max() keeps the first of equally scored arrows.
        return max(nearby, key=lambda arrow: arrow.score, default=None)
417

418
    def build(self, candidate: Candidate, result: ClassificationResult) -> SubAssembly:
        """Turn a scored subassembly candidate into a SubAssembly element.

        The count label is built first when one was found. If step numbers
        were found inside the box, the per-step breakdown is built; only when
        that yields no steps is the single primary diagram built instead.
        """
        details = candidate.score_details
        assert isinstance(details, _SubAssemblyScore)

        # Optional count label.
        count = None
        if details.step_count_candidate:
            count = result.build(details.step_count_candidate)
            assert isinstance(count, StepCount)

        # Multi-step subassembly: pair each step number with a diagram or image.
        steps: list[SubAssemblyStep] = (
            self._build_subassembly_steps(
                details.step_number_candidates,
                details.diagram_candidates,
                details.images_inside,
                result,
            )
            if details.step_number_candidates
            else []
        )

        # Single-diagram subassembly: used only when no steps were produced.
        diagram = None
        if not steps and details.diagram_candidate:
            diagram = result.build(details.diagram_candidate)
            assert isinstance(diagram, Diagram)

        return SubAssembly(
            bbox=candidate.bbox,
            steps=steps,
            diagram=diagram,
            count=count,
        )
454

455
    def _build_subassembly_steps(
        self,
        step_number_candidates: list[Candidate],
        diagram_candidates: list[Candidate],
        images_inside: list[Image],
        result: ClassificationResult,
    ) -> list[SubAssemblyStep]:
        """Build SubAssemblyStep elements by matching step numbers with diagrams.

        Step numbers are processed in ascending order of their value. For each
        one, the best positioned diagram candidate not yet claimed by an
        earlier step is selected; if no diagram candidate is left, the best
        positioned unclaimed Image block is wrapped in a Diagram instead.

        Selection is done on the candidates themselves and ``result.build`` is
        called only for the winner, so diagram candidates that merely led the
        comparison at some point are never built as a side effect.

        Args:
            step_number_candidates: Step number candidates inside the subassembly
            diagram_candidates: Diagram candidates inside the subassembly
            images_inside: Image blocks found directly inside the subassembly
            result: Classification result for building elements

        Returns:
            List of SubAssemblyStep elements, sorted by step number value
        """
        steps: list[SubAssemblyStep] = []
        used_diagram_ids: set[int] = set()
        used_image_ids: set[int] = set()

        # Sort step numbers by their value for consistent ordering.
        sorted_step_nums = sorted(
            step_number_candidates, key=self._extract_step_value
        )

        for step_num_candidate in sorted_step_nums:
            step_num_elem = result.build(step_num_candidate)
            assert isinstance(step_num_elem, StepNumber)

            best_diagram: Diagram | None = None

            # First choice: an unclaimed diagram candidate.
            chosen_candidate = self._pick_best_match(
                step_num_candidate.bbox, diagram_candidates, used_diagram_ids
            )
            if chosen_candidate is not None:
                used_diagram_ids.add(id(chosen_candidate))
                diagram_elem = result.build(chosen_candidate)
                assert isinstance(diagram_elem, Diagram)
                best_diagram = diagram_elem
            else:
                # Fallback: an unclaimed raw Image block inside the box.
                chosen_image = self._pick_best_match(
                    step_num_candidate.bbox, images_inside, used_image_ids
                )
                if chosen_image is not None:
                    used_image_ids.add(id(chosen_image))
                    best_diagram = Diagram(bbox=chosen_image.bbox)

            # The step covers its number plus the matched diagram, if any.
            step_bbox = step_num_elem.bbox
            if best_diagram is not None:
                step_bbox = step_bbox.union(best_diagram.bbox)

            steps.append(
                SubAssemblyStep(
                    bbox=step_bbox,
                    step_number=step_num_elem,
                    diagram=best_diagram,
                )
            )

        return steps

    def _pick_best_match(
        self,
        step_bbox: BBox,
        items: list[Candidate] | list[Image],
        used_ids: set[int],
    ) -> Candidate | Image | None:
        """Return the unclaimed item whose bbox best matches a step number.

        Args:
            step_bbox: Bounding box of the step number being matched
            items: Diagram candidates or Image blocks; only ``.bbox`` is read
            used_ids: ``id()`` values of items already claimed by other steps

        Returns:
            The item with the highest position score (the first one on ties),
            or None when every item is already claimed or ``items`` is empty
        """
        best_item = None
        best_score = -float("inf")

        for item in items:
            if id(item) in used_ids:
                continue
            score = self._score_step_diagram_match(step_bbox, item.bbox)
            if score > best_score:
                best_item, best_score = item, score

        return best_item
558

559
    def _extract_step_value(self, candidate: Candidate) -> int:
        """Read the numeric step value from a step_number candidate.

        Only the first source block is consulted, and only if it is a Text
        block.

        Args:
            candidate: A step_number candidate

        Returns:
            The parsed step number, or 0 when the candidate has no source
            blocks, its first block is not Text, or no value can be parsed
        """
        blocks = candidate.source_blocks
        if not blocks or not isinstance(blocks[0], Text):
            return 0

        parsed = extract_step_number_value(blocks[0].text)
        return 0 if parsed is None else parsed
573

574
    def _score_step_diagram_match(self, step_bbox: BBox, diagram_bbox: BBox) -> float:
1✔
575
        """Score how well a diagram matches a step number in a subassembly.
576

577
        In subassemblies, diagrams are typically positioned to the right of
578
        and/or below the step number.
579

580
        Args:
581
            step_bbox: The step number bounding box
582
            diagram_bbox: The diagram bounding box
583

584
        Returns:
585
            Score (higher is better match)
586
        """
587
        # Prefer diagrams that are:
588
        # 1. To the right of the step number (positive x_offset)
589
        # 2. Below or at same level (positive or small negative y_offset)
590
        # 3. Close by (small distance)
591

592
        x_offset = diagram_bbox.x0 - step_bbox.x1
1✔
593
        y_offset = diagram_bbox.y0 - step_bbox.y0
1✔
594

595
        # X score: prefer diagrams to the right
596
        if x_offset >= 0:
1✔
597
            x_score = 1.0 - min(x_offset / 200.0, 0.5)
1✔
598
        else:
599
            x_score = 0.5 + x_offset / 100.0  # Penalize left position
1✔
600

601
        # Y score: prefer diagrams at same level or below
602
        if abs(y_offset) < 50:
1✔
603
            y_score = 1.0
1✔
604
        elif y_offset >= 0:
1✔
605
            y_score = 0.8 - min(y_offset / 200.0, 0.3)
1✔
606
        else:
607
            y_score = 0.5 + y_offset / 100.0  # Penalize above position
×
608

609
        return x_score + y_score
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc