• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20010582393

07 Dec 2025 09:22PM UTC coverage: 90.299% (-0.02%) from 90.316%
20010582393

push

github

bramp
refactor(classifiers): use BBox helpers filter_contained and filter_overlapping

5 of 8 new or added lines in 2 files covered. (62.5%)

144 existing lines in 10 files now uncovered.

10779 of 11937 relevant lines covered (90.3%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.13
/src/build_a_long/pdf_extract/classifier/steps/subassembly_classifier.py
1
"""
2
SubAssembly classifier.
3

4
Purpose
5
-------
6
Identify sub-assembly callout boxes on LEGO instruction pages. SubAssemblies
7
typically:
8
- Are white/light-colored rectangular boxes with black borders
9
- Are larger than individual parts (to contain a small build diagram)
10
- May contain a count label (e.g., "2x") indicating how many to build
11
- May contain step numbers for multi-step subassemblies
12
- May have an arrow pointing from them to the main diagram
13

14
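Scoring is based on properties of the box and what it contains:
- Fill color (white/light)
- Presence of a step count inside the box (bonus)
- Presence of diagrams or images inside the box (bonus)

Boxes outside the configured minimum/maximum size are skipped before scoring.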
17

18
Child element discovery (step_count, step_numbers, diagrams, arrows) is
19
deferred to build time per DESIGN.md principles.
20

21
Debugging
22
---------
23
Set environment variables to aid investigation without code changes:
24

25
- LOG_LEVEL=DEBUG
26
    Enables DEBUG-level logging (if not already configured by caller).
27
"""
28

29
from __future__ import annotations
1✔
30

31
import logging
1✔
32
from typing import ClassVar
1✔
33

34
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
35
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
36
    ClassificationResult,
37
)
38
from build_a_long.pdf_extract.classifier.config import SubAssemblyConfig
1✔
39
from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
1✔
40
from build_a_long.pdf_extract.classifier.score import (
1✔
41
    Score,
42
    Weight,
43
    find_best_scoring,
44
)
45
from build_a_long.pdf_extract.classifier.text import extract_step_number_value
1✔
46
from build_a_long.pdf_extract.extractor.bbox import (
1✔
47
    BBox,
48
    filter_contained,
49
    filter_overlapping,
50
    group_by_similar_bbox,
51
)
52
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
53
    Diagram,
54
    StepCount,
55
    StepNumber,
56
    SubAssembly,
57
    SubAssemblyStep,
58
)
59
from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing, Image, Text
1✔
60

61
log = logging.getLogger(__name__)
1✔
62

63

64
class _SubAssemblyScore(Score):
    """Score breakdown for a single subassembly box candidate.

    Holds the fill-color score of the box together with flags recording which
    kinds of child elements were seen inside it. The final score combines the
    box score with the step-count and diagram flags using the weights from
    ``config``; ``has_step_numbers`` is recorded but does not contribute.
    """

    box_score: float
    """Score based on box having white/light fill (0.0-1.0)."""

    has_step_count: bool
    """Whether a step_count candidate exists inside (for scoring bonus)."""

    has_diagram_or_images: bool
    """Whether diagram candidates or images exist inside (for scoring bonus)."""

    has_step_numbers: bool
    """Whether step_number candidates exist inside (for multi-step subassemblies)."""

    config: SubAssemblyConfig
    """Configuration containing weights for score calculation."""

    def score(self) -> Weight:
        """Return the weighted sum of the box, count and diagram components."""
        weights = self.config

        # Each flag contributes its full weight when set and nothing otherwise.
        box_term = self.box_score * weights.box_shape_weight
        count_term = (1.0 if self.has_step_count else 0.0) * weights.count_weight
        diagram_term = (
            1.0 if self.has_diagram_or_images else 0.0
        ) * weights.diagram_weight

        return box_term + count_term + diagram_term
96

97

98
class SubAssemblyClassifier(LabelClassifier):
1✔
99
    """Classifier for subassembly callout boxes."""
100

101
    output: ClassVar[str] = "subassembly"
1✔
102
    requires: ClassVar[frozenset[str]] = frozenset(
1✔
103
        {"step_count", "step_number", "diagram"}
104
    )
105

106
    def _score(self, result: ClassificationResult) -> None:
        """Score Drawing blocks as potential subassembly boxes.

        Drawings within the configured size limits are grouped by similar
        bounding box. Each group is scored from its best fill-color score plus
        bonuses for a step count and diagrams/images found inside, and groups
        scoring at least ``min_score`` are added to ``result`` as
        "subassembly" candidates.

        Args:
            result: Classification result supplying the page data and the
                scored step_count, step_number and diagram candidates; new
                candidates are added to it.
        """
        page_data = result.page_data
        subassembly_config = self.config.subassembly

        # Child candidates are used here only to compute scoring bonuses.
        step_count_candidates = result.get_scored_candidates(
            "step_count", valid_only=False, exclude_failed=True
        )
        step_number_candidates = result.get_scored_candidates(
            "step_number", valid_only=False, exclude_failed=True
        )
        diagram_candidates = result.get_scored_candidates(
            "diagram", valid_only=False, exclude_failed=True
        )

        # Size limits: absolute minimums from config, maximums relative to page
        max_width = page_data.bbox.width * subassembly_config.max_page_width_ratio
        max_height = page_data.bbox.height * subassembly_config.max_page_height_ratio

        valid_drawings: list[Drawing] = []
        for block in page_data.blocks:
            if not isinstance(block, Drawing):
                continue

            bbox = block.bbox

            # Skip boxes smaller than minimum subassembly size
            if (
                bbox.width < subassembly_config.min_subassembly_width
                or bbox.height < subassembly_config.min_subassembly_height
            ):
                continue

            # Skip boxes larger than maximum subassembly size
            if bbox.width > max_width or bbox.height > max_height:
                log.debug(
                    "[subassembly] Skipping oversized box at %s "
                    "(%.1f x %.1f > max %.1f x %.1f)",
                    bbox,
                    bbox.width,
                    bbox.height,
                    max_width,
                    max_height,
                )
                continue

            valid_drawings.append(block)

        # Group drawings with similar bboxes (e.g., white-filled box and
        # black-bordered box for the same subassembly)
        groups = group_by_similar_bbox(valid_drawings, tolerance=2.0)

        # Process each group - create one candidate per unique bbox region
        for group in groups:
            # Use union of all grouped drawings' bboxes
            bbox = BBox.union_all([d.bbox for d in group])

            # Score each drawing's colors and pick the best; groups without a
            # sufficiently light fill are dropped before any further work
            best_box_score = max(self._score_box_colors(d) for d in group)
            if best_box_score < 0.3:
                continue

            # Check for child elements inside the box (for scoring only)
            # Actual candidate discovery happens at build time
            has_step_count = bool(
                self._find_candidate_inside(bbox, step_count_candidates)
            )
            has_step_numbers = bool(
                self._find_all_candidates_inside(bbox, step_number_candidates)
            )
            diagrams_inside = self._find_all_diagrams_inside(bbox, diagram_candidates)
            images_inside = self._find_images_inside(bbox, page_data.blocks)
            has_diagram_or_images = bool(diagrams_inside or images_inside)

            # We need at least a box - count and diagram are optional
            score_details = _SubAssemblyScore(
                box_score=best_box_score,
                has_step_count=has_step_count,
                has_diagram_or_images=has_diagram_or_images,
                has_step_numbers=has_step_numbers,
                config=subassembly_config,
            )

            # Compute the weighted score once and reuse it for the threshold
            # check, the candidate and the log messages.
            total_score = score_details.score()

            if total_score < subassembly_config.min_score:
                log.debug(
                    "[subassembly] Rejected box at %s: score=%.2f < min_score=%.2f",
                    bbox,
                    total_score,
                    subassembly_config.min_score,
                )
                continue

            result.add_candidate(
                Candidate(
                    bbox=bbox,
                    label="subassembly",
                    score=total_score,
                    score_details=score_details,
                    source_blocks=list(group),
                )
            )
            log.debug(
                "[subassembly] Candidate at %s: has_count=%s, "
                "has_steps=%s, has_diagrams_or_images=%s, score=%.2f",
                bbox,
                has_step_count,
                has_step_numbers,
                has_diagram_or_images,
                total_score,
            )
218

219
    def _score_box_colors(self, block: Drawing) -> float:
1✔
220
        """Score a drawing block based on having white fill.
221

222
        SubAssembly boxes typically have a white or light fill color.
223
        The outer black border boxes can be matched separately later.
224

225
        Args:
226
            block: The Drawing block to analyze
227

228
        Returns:
229
            Score from 0.0 to 1.0 where 1.0 is white fill
230
        """
231
        # Check fill color (white or light = good)
232
        if block.fill_color is not None:
1✔
233
            r, g, b = block.fill_color
1✔
234
            # Check if it's white or very light (all channels > 0.9)
235
            if r > 0.9 and g > 0.9 and b > 0.9:
1✔
236
                return 1.0
1✔
237
            # Light gray is also acceptable
238
            if r > 0.7 and g > 0.7 and b > 0.7:
1✔
239
                return 0.6
1✔
240

241
        return 0.0
1✔
242

243
    def _find_candidate_inside(
        self, bbox: BBox, candidates: list[Candidate]
    ) -> Candidate | None:
        """Pick the best-scoring candidate among those contained in the box.

        Args:
            bbox: The bounding box of the subassembly container
            candidates: Candidates to search

        Returns:
            The result of ``find_best_scoring`` over the candidates that
            ``filter_contained`` keeps for ``bbox`` (a candidate or None)
        """
        contained = filter_contained(candidates, bbox)
        return find_best_scoring(contained)
1✔
256

257
    def _find_all_candidates_inside(
        self, bbox: BBox, candidates: list[Candidate]
    ) -> list[Candidate]:
        """Collect every candidate contained in the box, best score first.

        Args:
            bbox: The bounding box of the subassembly container
            candidates: Candidates to search

        Returns:
            The candidates kept by ``filter_contained`` for ``bbox``, ordered
            by descending score
        """
        return sorted(
            filter_contained(candidates, bbox),
            key=lambda cand: cand.score,
            reverse=True,
        )
1✔
274

275
    def _find_diagram_inside(
        self, bbox: BBox, diagram_candidates: list[Candidate]
    ) -> Candidate | None:
        """Pick the diagram candidate sharing the largest area with the box.

        Args:
            bbox: The bounding box of the subassembly container
            diagram_candidates: Diagram candidates to search

        Returns:
            The overlapping candidate with the largest intersection area, or
            None when no candidate has an intersection area greater than zero
        """
        winner: Candidate | None = None
        largest_area = 0.0

        # Only candidates kept by filter_overlapping are measured.
        for contender in filter_overlapping(diagram_candidates, bbox):
            # TODO Should this use bbox.intersection_area(candidate.bbox)?
            shared = bbox.intersect(contender.bbox)
            shared_area = shared.width * shared.height

            # Strictly-greater comparison: the first of equal areas is kept
            if not shared_area > largest_area:
                continue
            winner, largest_area = contender, shared_area

        return winner
×
303

304
    def _find_all_diagrams_inside(
        self, bbox: BBox, diagram_candidates: list[Candidate]
    ) -> list[Candidate]:
        """Collect every diagram candidate contained in the box, largest first.

        Args:
            bbox: The bounding box of the subassembly container
            diagram_candidates: Diagram candidates to search

        Returns:
            The diagram candidates kept by ``filter_contained`` for ``bbox``,
            ordered by descending bounding-box area
        """
        return sorted(
            filter_contained(diagram_candidates, bbox),
            key=lambda cand: cand.bbox.area,
            reverse=True,
        )
1✔
320

321
    def _find_images_inside(self, bbox: BBox, blocks: list[Blocks]) -> list[Image]:
        """Collect sufficiently large Image blocks contained in the box.

        Image blocks are inspected directly rather than through diagram
        candidates: images inside subassembly boxes often get clustered with
        larger diagrams outside the box, so they have to be found here.

        Args:
            bbox: The bounding box of the subassembly container
            blocks: All blocks on the page

        Returns:
            Image blocks with area of at least 100.0 that ``filter_contained``
            keeps for ``bbox``, ordered by descending area
        """
        min_area = 100.0  # Skip very small images (decorative elements)

        sizeable_images: list[Image] = []
        for blk in blocks:
            if isinstance(blk, Image) and blk.bbox.area >= min_area:
                sizeable_images.append(blk)

        # Larger images are more likely to be diagrams, so they come first
        return sorted(
            filter_contained(sizeable_images, bbox),
            key=lambda image: image.bbox.area,
            reverse=True,
        )
1✔
345

346
    def build(self, candidate: Candidate, result: ClassificationResult) -> SubAssembly:
        """Construct a SubAssembly element from a candidate.

        Child elements are discovered now, at build time, inside the
        candidate's bounding box:
        - the best step_count candidate becomes the count
        - step_number candidates are paired with diagrams/images into steps
        - when no steps are built, the largest diagram candidate (or, failing
          that, the largest image) becomes the standalone diagram

        Args:
            candidate: The subassembly candidate to build
            result: Classification result used to look up and build children

        Returns:
            The constructed SubAssembly

        Raises:
            ValueError: If neither a standalone diagram nor any step diagram
                was found inside the box
        """

        def scored(label: str) -> list[Candidate]:
            # Same lookup settings are used for every child label
            return result.get_scored_candidates(
                label, valid_only=False, exclude_failed=True
            )

        bbox = candidate.bbox
        page_data = result.page_data

        step_count_candidates = scored("step_count")
        step_number_candidates = scored("step_number")
        diagram_candidates = scored("diagram")

        # Optional count label (e.g. "2x") inside the box
        count = None
        step_count_candidate = self._find_candidate_inside(bbox, step_count_candidates)
        if step_count_candidate:
            built_count = result.build(step_count_candidate)
            assert isinstance(built_count, StepCount)
            count = built_count

        # Everything else that sits inside the box
        step_nums_inside = self._find_all_candidates_inside(
            bbox, step_number_candidates
        )
        diagrams_inside = self._find_all_diagrams_inside(bbox, diagram_candidates)
        images_inside = self._find_images_inside(bbox, page_data.blocks)

        # Multi-step subassembly: pair each step number with a diagram/image
        steps: list[SubAssemblyStep] = (
            self._build_subassembly_steps(
                step_nums_inside,
                diagrams_inside,
                images_inside,
                result,
            )
            if step_nums_inside
            else []
        )

        # Single-diagram subassembly: only considered when no steps were built
        diagram = None
        if not steps:
            if diagrams_inside:
                built_diagram = result.build(diagrams_inside[0])
                assert isinstance(built_diagram, Diagram)
                diagram = built_diagram
            elif images_inside:
                # Fall back to using an Image directly as the diagram
                diagram = Diagram(bbox=images_inside[0].bbox)

        # Subassemblies must contain at least one diagram
        # (either in steps or standalone)
        if diagram is None and not any(s.diagram is not None for s in steps):
            raise ValueError(
                f"SubAssembly at {bbox} has no diagram - "
                "subassemblies must contain at least one diagram"
            )

        return SubAssembly(bbox=bbox, steps=steps, diagram=diagram, count=count)
423

424
    def _build_subassembly_steps(
        self,
        step_number_candidates: list[Candidate],
        diagram_candidates: list[Candidate],
        images_inside: list[Image],
        result: ClassificationResult,
    ) -> list[SubAssemblyStep]:
        """Build SubAssemblyStep elements by matching step numbers with diagrams.

        Uses a simple heuristic: diagrams are typically to the right of and/or
        below the step number. For each step number (in ascending value order)
        the best-scoring unused diagram candidate is selected and built. If no
        diagram candidate is available, the best-scoring unused Image block is
        wrapped in a Diagram instead. Each diagram candidate or image is
        assigned to at most one step.

        Only the winning diagram candidate of each step is built; candidates
        that are merely scored are never passed to ``result.build``.

        Args:
            step_number_candidates: Step number candidates inside the subassembly
            diagram_candidates: Diagram candidates inside the subassembly
            images_inside: Image blocks found directly inside the subassembly
            result: Classification result for building elements

        Returns:
            List of SubAssemblyStep elements, sorted by step number value. A
            step's ``diagram`` is None when nothing was left to match it with.
        """
        steps: list[SubAssemblyStep] = []
        # Candidates/images are tracked by object identity so each is used once
        used_diagram_ids: set[int] = set()
        used_image_ids: set[int] = set()

        # Sort step numbers by their value for consistent ordering
        sorted_step_nums = sorted(
            step_number_candidates,
            key=self._extract_step_value,
        )

        for step_num_candidate in sorted_step_nums:
            # Build the step number element
            step_num_elem = result.build(step_num_candidate)
            assert isinstance(step_num_elem, StepNumber)

            # Select the best unused diagram candidate first; building is
            # deferred until the winner is known so losing candidates are
            # not built as a side effect of the search.
            best_candidate: Candidate | None = None
            best_score = -float("inf")
            for diagram_candidate in diagram_candidates:
                if id(diagram_candidate) in used_diagram_ids:
                    continue

                score = self._score_step_diagram_match(
                    step_num_candidate.bbox, diagram_candidate.bbox
                )
                if score > best_score:
                    best_score = score
                    best_candidate = diagram_candidate

            best_diagram: Diagram | None = None
            if best_candidate is not None:
                diagram_elem = result.build(best_candidate)
                assert isinstance(diagram_elem, Diagram)
                best_diagram = diagram_elem
                used_diagram_ids.add(id(best_candidate))
            else:
                # No diagram candidate left: try Image blocks directly
                best_image: Image | None = None
                best_score = -float("inf")
                for image in images_inside:
                    if id(image) in used_image_ids:
                        continue

                    score = self._score_step_diagram_match(
                        step_num_candidate.bbox, image.bbox
                    )
                    if score > best_score:
                        best_score = score
                        best_image = image

                if best_image is not None:
                    used_image_ids.add(id(best_image))
                    # Create a Diagram from the Image
                    best_diagram = Diagram(bbox=best_image.bbox)

            # The step covers its number and, when present, its diagram
            step_bbox = step_num_elem.bbox
            if best_diagram:
                step_bbox = step_bbox.union(best_diagram.bbox)

            steps.append(
                SubAssemblyStep(
                    bbox=step_bbox,
                    step_number=step_num_elem,
                    diagram=best_diagram,
                )
            )

        return steps
×
527

528
    def _extract_step_value(self, candidate: Candidate) -> int:
        """Read the numeric step value from a step_number candidate.

        Only the candidate's first source block is consulted, and only when it
        is a Text block.

        Args:
            candidate: A step_number candidate

        Returns:
            The value parsed from the first source block's text, or 0 when
            there is no leading Text block or no value can be extracted
        """
        blocks = candidate.source_blocks
        if not blocks or not isinstance(blocks[0], Text):
            return 0

        parsed = extract_step_number_value(blocks[0].text)
        return 0 if parsed is None else parsed
×
542

543
    def _score_step_diagram_match(self, step_bbox: BBox, diagram_bbox: BBox) -> float:
1✔
544
        """Score how well a diagram matches a step number in a subassembly.
545

546
        In subassemblies, diagrams are typically positioned to the right of
547
        and/or below the step number.
548

549
        Args:
550
            step_bbox: The step number bounding box
551
            diagram_bbox: The diagram bounding box
552

553
        Returns:
554
            Score (higher is better match)
555
        """
556
        # Prefer diagrams that are:
557
        # 1. To the right of the step number (positive x_offset)
558
        # 2. Below or at same level (positive or small negative y_offset)
559
        # 3. Close by (small distance)
560

561
        x_offset = diagram_bbox.x0 - step_bbox.x1
1✔
562
        y_offset = diagram_bbox.y0 - step_bbox.y0
1✔
563

564
        # X score: prefer diagrams to the right
565
        if x_offset >= 0:
1✔
566
            x_score = 1.0 - min(x_offset / 200.0, 0.5)
1✔
567
        else:
568
            x_score = 0.5 + x_offset / 100.0  # Penalize left position
1✔
569

570
        # Y score: prefer diagrams at same level or below
571
        if abs(y_offset) < 50:
1✔
572
            y_score = 1.0
1✔
573
        elif y_offset >= 0:
1✔
574
            y_score = 0.8 - min(y_offset / 200.0, 0.3)
1✔
575
        else:
UNCOV
576
            y_score = 0.5 + y_offset / 100.0  # Penalize above position
×
577

578
        return x_score + y_score
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc