• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20401440535

20 Dec 2025 11:19PM UTC coverage: 89.38% (+0.01%) from 89.367%
20401440535

push

github

bramp
Standardize RuleBasedClassifier effect finding and resolve PartCount conflicts

- Set default effects_margin to 2.0 in RuleBasedClassifier.
- Removed effects_max_area_ratio and effects_target_types from base class to simplify API.
- Implemented local filtering in PartCountClassifier and PartNumberClassifier to only consume Drawing effects, preventing conflicts with PartImage Image blocks.
- Opted-out non-content classifiers (Divider, ProgressBarBar, etc.) from automatic effect finding to maintain behavior.
- Cleaned up ProgressBarIndicatorClassifier to use standard base class properties.
- Updated golden files for pages 072 and 176 to reflect minor bbox changes from refined PartCount behavior.

37 of 39 new or added lines in 10 files covered. (94.87%)

26 existing lines in 7 files now uncovered.

13701 of 15329 relevant lines covered (89.38%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.0
/src/build_a_long/pdf_extract/classifier/steps/arrow_classifier.py
1
"""
2
Arrow classifier.
3

4
Purpose
5
-------
6
Identify arrows on LEGO instruction pages. Arrows typically:
7
- Point from a main assembly to a sub-step callout
8
- Indicate direction of motion or insertion
9
- Connect related elements visually
10

11
Heuristic
12
---------
13
1. Find Drawing blocks with triangular shapes (3-5 line items) - the arrowhead
14
2. Filter to small filled shapes (5-20px, filled color)
15
3. Calculate the tip (furthest point from centroid)
16
4. Calculate direction angle from centroid to tip
17
5. Search for an adjacent thin rectangle (the shaft) that connects to the arrowhead
18
6. Trace the shaft to find the tail point (far end from arrowhead)
19

20
Arrows consist of:
21
- Arrowhead: A small filled triangular shape (3-5 line items)
22
- Shaft: A thin filled rectangle, a stroked line, or a multi-segment line path
  whose color matches the arrowhead and that connects to it
23

24
Debugging
25
---------
26
Enable with `LOG_LEVEL=DEBUG` for structured logs.
27
"""
28

29
from __future__ import annotations
1✔
30

31
import logging
1✔
32
import math
1✔
33
from typing import ClassVar
1✔
34

35
from pydantic import BaseModel
1✔
36

37
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
38
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
39
    ClassificationResult,
40
)
41
from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
1✔
42
from build_a_long.pdf_extract.classifier.score import Score, Weight
1✔
43
from build_a_long.pdf_extract.classifier.utils import (
1✔
44
    colors_match,
45
    extract_unique_points,
46
)
47
from build_a_long.pdf_extract.extractor.bbox import BBox, filter_overlapping
1✔
48
from build_a_long.pdf_extract.extractor.lego_page_elements import Arrow, ArrowHead
1✔
49
from build_a_long.pdf_extract.extractor.page_blocks import Blocks, Drawing
1✔
50

51
log = logging.getLogger(__name__)
1✔
52

53

54
class _ArrowHeadData(BaseModel):
    """Data for a single arrowhead within an arrow.

    Instances are created by ``ArrowClassifier._score_arrowhead`` for each
    Drawing block that passes the arrowhead checks. They are later grouped
    into arrows (by shared shaft block or nearby tails) and carried inside
    ``_ArrowScore.heads``.
    """

    # Vertex of the shape that lies furthest from the centroid of its points.
    tip: tuple[float, float]
    """The tip point (x, y) - where the arrowhead points TO."""

    # Angle of the centroid -> tip vector, computed with atan2 and converted
    # to degrees.
    direction: float
    """Direction angle in degrees (0=right, 90=down, 180=left, -90=up)."""

    # 1.0 for 3 unique points, 0.9 for 4, 0.7 otherwise (see _score_arrowhead).
    shape_score: float
    """Score based on shape being triangular (0.0-1.0)."""

    # Decreases as the bbox width/height move away from the configured
    # ideal size; clamped at 0.0.
    size_score: float
    """Score based on size being in expected range (0.0-1.0)."""

    # The Drawing that was scored; always part of the candidate's source blocks.
    block: Drawing
    """The Drawing block for this arrowhead."""

    # Set together with `tail` when ArrowClassifier._find_shaft finds a shaft.
    shaft_block: Drawing | None = None
    """The shaft Drawing block, if detected."""

    # Shaft point furthest from the tip; used to group heads into one arrow.
    tail: tuple[float, float] | None = None
    """The tail/origin point where the shaft starts. None if no shaft detected."""
77

78

79
class _ArrowScore(Score):
    """Scoring details for one arrow: its arrowheads plus an optional shaft.

    The overall score is the mean, over all heads, of each head's weighted
    shape/size score.
    """

    heads: list[_ArrowHeadData]
    """Data for each arrowhead in this arrow."""

    tail: tuple[float, float] | None = None
    """The tail/origin point where the shaft starts. None if no shaft detected."""

    shaft_block: Drawing | None = None
    """The shaft Drawing block, if detected."""

    # Relative contribution of the shape and size sub-scores.
    shape_weight: float = 0.7
    size_weight: float = 0.3

    def score(self) -> Weight:
        """Return the mean weighted score across heads (0.0 when there are none)."""
        heads = self.heads
        if heads:
            w_shape, w_size = self.shape_weight, self.size_weight
            weighted_total = sum(
                head.shape_score * w_shape + head.size_score * w_size
                for head in heads
            )
            return weighted_total / len(heads)
        return 0.0
104

105

106
class ArrowClassifier(LabelClassifier):
1✔
107
    """Classifier for arrow elements (arrowheads).
108

109
    Implementation Pattern: Arrowhead + Shaft Discovery
110
    ----------------------------------------------------
111
    This classifier scores arrowheads using geometric rules, then discovers
112
    shaft blocks during scoring. This pattern is justified because:
113

114
    1. **Single Visual Element**: Arrowhead + shaft form a single arrow element
115
    2. **Intrinsic Composition**: The shaft is a direct visual extension of the
116
       arrowhead, not a separate classified element
117
    3. **Spatial Relationship**: Shaft discovery is based on geometric properties
118
       (position, alignment) relative to the arrowhead
119

120
    The shaft blocks are included in source_blocks, ensuring they're consumed
121
    together when the arrow candidate wins.
122
    """
123

124
    output: ClassVar[str] = "arrow"
1✔
125
    requires: ClassVar[frozenset[str]] = frozenset()
1✔
126

127
    def _score(self, result: ClassificationResult) -> None:
        """Find arrowheads among the page's drawings and emit arrow candidates.

        Works in three phases: score every Drawing block as a potential
        arrowhead, group the accepted heads into arrows, then register one
        candidate per group on ``result``.

        Args:
            result: Classification result holding the page data; candidates
                are added to it in place.
        """
        cfg = self.config.arrow
        drawings = [b for b in result.page_data.blocks if isinstance(b, Drawing)]

        # Phase 1: keep only drawings that look like arrowheads and whose
        # weighted score reaches the configured minimum.
        accepted: list[_ArrowHeadData] = []
        for drawing in drawings:
            candidate_head = self._score_arrowhead(drawing, drawings)
            if candidate_head is None:
                continue

            weighted = (
                candidate_head.shape_score * cfg.shape_weight
                + candidate_head.size_score * cfg.size_weight
            )
            if weighted < cfg.min_score:
                log.debug(
                    "[arrow] Rejected at %s: score=%.2f < min_score=%.2f",
                    drawing.bbox,
                    weighted,
                    cfg.min_score,
                )
                continue

            accepted.append(candidate_head)

        # Phase 2: merge heads that belong to the same arrow - heads sharing
        # a shaft block (L-shaped arrows) or with nearby tails (Y-shaped).
        grouped = self._group_arrowheads(accepted, cfg.tail_grouping_tolerance)

        # Phase 3: one candidate per group.
        for group in grouped:
            self._add_arrow_candidate(result, group)
168

169
    def _group_arrowheads(
1✔
170
        self, arrowheads: list[_ArrowHeadData], tail_tolerance: float
171
    ) -> list[list[_ArrowHeadData]]:
172
        """Group arrowheads that belong to the same arrow.
173

174
        Arrowheads are grouped together if they:
175
        1. Share the same shaft_block (same object identity), OR
176
        2. Have tails within tail_tolerance distance of each other
177

178
        Args:
179
            arrowheads: List of arrowhead data to group
180
            tail_tolerance: Maximum distance between tail coordinates to be grouped
181

182
        Returns:
183
            List of groups, where each group is a list of arrowheads
184
        """
185
        if not arrowheads:
1✔
186
            return []
1✔
187

188
        # Use union-find to group arrowheads
189
        # Each arrowhead starts in its own group
190
        parent: dict[int, int] = {i: i for i in range(len(arrowheads))}
1✔
191

192
        def find(x: int) -> int:
1✔
193
            if parent[x] != x:
1✔
194
                parent[x] = find(parent[x])
1✔
195
            return parent[x]
1✔
196

197
        def union(x: int, y: int) -> None:
1✔
198
            px, py = find(x), find(y)
1✔
199
            if px != py:
1✔
200
                parent[px] = py
1✔
201

202
        # Group by shared shaft_block (same object identity)
203
        shaft_to_indices: dict[int, list[int]] = {}
1✔
204
        for i, head in enumerate(arrowheads):
1✔
205
            if head.shaft_block is not None:
1✔
206
                shaft_id = id(head.shaft_block)
1✔
207
                if shaft_id in shaft_to_indices:
1✔
208
                    # Union with first arrowhead sharing this shaft
209
                    union(i, shaft_to_indices[shaft_id][0])
1✔
210
                    shaft_to_indices[shaft_id].append(i)
1✔
211
                else:
212
                    shaft_to_indices[shaft_id] = [i]
1✔
213

214
        # Group by tail proximity
215
        for i, head_i in enumerate(arrowheads):
1✔
216
            if head_i.tail is None:
1✔
217
                continue
1✔
218
            for j, head_j in enumerate(arrowheads):
1✔
219
                if i >= j or head_j.tail is None:
1✔
220
                    continue
1✔
221
                dx = head_i.tail[0] - head_j.tail[0]
1✔
222
                dy = head_i.tail[1] - head_j.tail[1]
1✔
223
                dist = math.sqrt(dx * dx + dy * dy)
1✔
224
                if dist <= tail_tolerance:
1✔
225
                    union(i, j)
1✔
226

227
        # Collect groups
228
        group_map: dict[int, list[_ArrowHeadData]] = {}
1✔
229
        for i, head in enumerate(arrowheads):
1✔
230
            root = find(i)
1✔
231
            group_map.setdefault(root, []).append(head)
1✔
232

233
        return list(group_map.values())
1✔
234

235
    def _add_arrow_candidate(
        self, result: ClassificationResult, heads: list[_ArrowHeadData]
    ) -> None:
        """Create and add an arrow candidate from a group of arrowheads.

        The candidate's source blocks are every head block and every detected
        shaft block in the group (each listed once), and its bbox is the union
        of those blocks' bboxes.

        Args:
            result: Classification result that receives the new candidate.
            heads: Arrowheads that make up one arrow. Expected to be
                non-empty (groups built by ``_group_arrowheads`` always are).
        """
        # Collect source blocks (deduplicated by object identity). For each
        # head the arrowhead block comes first, then its shaft if there is one.
        seen_ids: set[int] = set()
        source_blocks: list[Blocks] = []
        for head in heads:
            for block in (head.block, head.shaft_block):
                if block is None or id(block) in seen_ids:
                    continue
                seen_ids.add(id(block))
                source_blocks.append(block)

        # Compute combined bbox
        arrow_bbox = BBox.union_all([b.bbox for b in source_blocks])

        # First tail/shaft found in the group (if any). Explicit None checks,
        # matching the dedupe loop above, so the result never depends on the
        # truthiness of a tuple or of a Drawing model.
        tail = next((h.tail for h in heads if h.tail is not None), None)
        shaft_block = next(
            (h.shaft_block for h in heads if h.shaft_block is not None), None
        )

        arrow_score = _ArrowScore(
            heads=heads,
            tail=tail,
            shaft_block=shaft_block,
            shape_weight=self.config.arrow.shape_weight,
            size_weight=self.config.arrow.size_weight,
        )
        # Evaluate once; the same value is stored on the candidate and logged.
        score_value = arrow_score.score()

        result.add_candidate(
            Candidate(
                bbox=arrow_bbox,
                label="arrow",
                score=score_value,
                score_details=arrow_score,
                source_blocks=source_blocks,
            )
        )

        if len(heads) == 1:
            log.debug(
                "[arrow] Candidate at %s: score=%.2f, direction=%.0f°",
                arrow_bbox,
                score_value,
                heads[0].direction,
            )
        else:
            log.debug(
                "[arrow] Candidate (multi-head) at %s: score=%.2f, heads=%d",
                arrow_bbox,
                score_value,
                len(heads),
            )
289

290
    def _score_arrowhead(
        self, block: Drawing, all_drawings: list[Drawing]
    ) -> _ArrowHeadData | None:
        """Evaluate one Drawing block as a potential arrowhead.

        A block qualifies when it is filled, its bbox satisfies the configured
        size and aspect-ratio limits, and its path consists solely of 3-5 line
        items spanning 3-5 unique points.

        Args:
            block: The Drawing block to evaluate.
            all_drawings: Every Drawing block on the page; searched for a
                shaft belonging to this arrowhead.

        Returns:
            The arrowhead data (with shaft_block and tail filled in when a
            shaft was found), or None when the block is not an arrowhead.
        """
        cfg = self.config.arrow
        box = block.bbox
        path_items = block.items

        # Arrowheads are non-empty, filled shapes.
        if not path_items or not block.fill_color:
            return None

        # Both bbox sides must lie within the configured size range.
        if any(
            side < cfg.min_size or side > cfg.max_size
            for side in (box.width, box.height)
        ):
            return None

        # Aspect ratio: triangles are roughly square-ish to elongated.
        aspect = box.width / box.height if box.height > 0 else 0
        if aspect < cfg.min_aspect_ratio or aspect > cfg.max_aspect_ratio:
            return None

        # The path must be made of lines only (no curves, rectangles, etc.),
        # and there must be 3-5 of them.
        line_items = [item for item in path_items if item[0] == "l"]
        line_count = len(line_items)
        if line_count != len(path_items) or not 3 <= line_count <= 5:
            return None

        points = extract_unique_points(line_items)
        point_count = len(points)
        if not 3 <= point_count <= 5:
            return None

        # Centroid of the unique points.
        cx = sum(p[0] for p in points) / point_count
        cy = sum(p[1] for p in points) / point_count

        def dist_from_centroid(p) -> float:
            return math.sqrt((p[0] - cx) ** 2 + (p[1] - cy) ** 2)

        # The tip is the point furthest from the centroid (first one on ties);
        # the direction is the angle of the centroid -> tip vector.
        tip = max(points, key=dist_from_centroid)
        direction = math.degrees(math.atan2(tip[1] - cy, tip[0] - cx))

        # Fewer unique points means closer to a clean triangle.
        shape_score = {3: 1.0, 4: 0.9}.get(point_count, 0.7)

        # Size score drops linearly with the summed deviation of width and
        # height from the ideal size, floored at 0.
        ideal = cfg.ideal_size
        deviation = abs(box.width - ideal) + abs(box.height - ideal)
        size_score = max(0.0, 1.0 - (deviation / (ideal * 2)))

        # Look for a shaft attached to this arrowhead.
        shaft_block: Drawing | None = None
        tail: tuple[float, float] | None = None
        found = self._find_shaft(block, direction, tip, all_drawings)
        if found is not None:
            shaft_block, tail = found
            log.debug(
                "[arrow] Found shaft for arrowhead at %s, tail at %s",
                block.bbox,
                tail,
            )

        return _ArrowHeadData(
            tip=tip,
            direction=direction,
            shape_score=shape_score,
            size_score=size_score,
            block=block,
            shaft_block=shaft_block,
            tail=tail,
        )
394

395
    def _find_shaft(
        self,
        arrowhead: Drawing,
        direction: float,
        tip: tuple[float, float],
        all_drawings: list[Drawing],
    ) -> tuple[Drawing, tuple[float, float]] | None:
        """Locate the shaft drawing that belongs to an arrowhead.

        Candidate shafts are drawings overlapping the arrowhead's bbox expanded
        by 20 px. A candidate is kept when:

        - it has a fill or stroke color, and that color matches the
          arrowhead's fill color,
        - its path has items and none of them is a curve ("c"),
        - if it is a single rectangle ("re"), its thinner side is at most 5 px,
        - its nearest path point lies within 15 px of the arrowhead tip, and
        - its furthest path point (the tail) is at least 10 px away from that
          nearest point.

        Among the kept candidates, the one whose nearest point is closest to
        the tip wins (the earliest such candidate on ties).

        Args:
            arrowhead: The arrowhead Drawing block.
            direction: Direction angle in degrees. Part of the signature but
                not consulted by this implementation.
            tip: The tip point (x, y) of the arrowhead; all distances are
                measured from it.
            all_drawings: All Drawing blocks on the page to search.

        Returns:
            ``(shaft drawing, tail point)`` for the best candidate, or None
            when no drawing qualifies.
        """
        # Thresholds, in pixels.
        max_connection_distance = 15.0  # max gap between shaft and tip
        min_shaft_length = 10.0  # min distance from connection point to tail
        max_shaft_thickness = 5.0  # thin-rectangle shafts only

        def dist_to_tip(point) -> float:
            return math.sqrt((point[0] - tip[0]) ** 2 + (point[1] - tip[1]) ** 2)

        # Only drawings near the arrowhead can be attached to it.
        nearby = filter_overlapping(all_drawings, arrowhead.bbox.expand(20.0))

        winner: tuple[Drawing, tuple[float, float]] | None = None
        winner_gap = float("inf")

        for drawing in nearby:
            if drawing is arrowhead:
                continue

            # A shaft may be filled or stroked; either color has to match the
            # arrowhead's fill color.
            shaft_color = drawing.fill_color or drawing.stroke_color
            if not shaft_color:
                continue
            if arrowhead.fill_color and not colors_match(
                arrowhead.fill_color, shaft_color
            ):
                continue

            items = drawing.items
            if not items:
                continue

            # Paths containing curves are typically not shafts.
            if any(item[0] == "c" for item in items):
                continue

            # A lone rectangle must be thin to count as a shaft.
            if len(items) == 1 and items[0][0] == "re":
                rect_box = drawing.bbox
                if min(rect_box.width, rect_box.height) > max_shaft_thickness:
                    continue

            points = self._extract_path_points(items)
            if len(points) < 2:
                continue

            # Connection point: the path point nearest to the tip.
            connection = min(points, key=dist_to_tip)
            gap = dist_to_tip(connection)
            if gap > max_connection_distance:
                continue

            # Tail: the path point furthest from the tip.
            tail_point = max(points, key=dist_to_tip)
            shaft_length = math.sqrt(
                (tail_point[0] - connection[0]) ** 2
                + (tail_point[1] - connection[1]) ** 2
            )
            if shaft_length < min_shaft_length:
                continue

            # Prefer the shaft that connects most tightly to the tip.
            if gap < winner_gap:
                winner_gap = gap
                winner = (drawing, tail_point)

        return winner
517

518
    def _extract_path_points(
1✔
519
        self, items: tuple[tuple, ...] | list[tuple]
520
    ) -> list[tuple[float, float]]:
521
        """Extract all unique points from path items.
522

523
        Args:
524
            items: List of path items (lines, rectangles, curves)
525

526
        Returns:
527
            List of unique (x, y) points from the path
528
        """
529
        points: list[tuple[float, float]] = []
1✔
530
        seen: set[tuple[float, float]] = set()
1✔
531

532
        for item in items:
1✔
533
            item_type = item[0]
1✔
534

535
            if item_type == "l":
1✔
536
                # Line: ('l', (x1, y1), (x2, y2))
537
                p1, p2 = item[1], item[2]
1✔
538
                for p in [p1, p2]:
1✔
539
                    key = (round(p[0], 1), round(p[1], 1))
1✔
540
                    if key not in seen:
1✔
541
                        seen.add(key)
1✔
542
                        points.append((p[0], p[1]))
1✔
543

544
            elif item_type == "re":
1✔
545
                # Rectangle: ('re', (x0, y0, x1, y1), ...)
546
                rect = item[1]
1✔
547
                # Add all four corners
548
                corners = [
1✔
549
                    (rect[0], rect[1]),  # top-left
550
                    (rect[2], rect[1]),  # top-right
551
                    (rect[0], rect[3]),  # bottom-left
552
                    (rect[2], rect[3]),  # bottom-right
553
                ]
554
                for p in corners:
1✔
555
                    key = (round(p[0], 1), round(p[1], 1))
1✔
556
                    if key not in seen:
1✔
557
                        seen.add(key)
1✔
558
                        points.append(p)
1✔
559

560
        return points
1✔
561

562
    def build(self, candidate: Candidate, result: ClassificationResult) -> Arrow:
        """Turn a winning arrow candidate into an Arrow element.

        Args:
            candidate: Candidate whose score_details is the ``_ArrowScore``
                attached when the candidate was created.
            result: Classification result (not used here).

        Returns:
            An Arrow with the candidate's bbox, one ArrowHead per scored head,
            and the tail recorded in the score details.
        """
        details = candidate.score_details
        assert isinstance(details, _ArrowScore)

        # Only tip and direction are carried over from each scored head.
        arrow_heads = [
            ArrowHead(tip=data.tip, direction=data.direction)
            for data in details.heads
        ]
        return Arrow(bbox=candidate.bbox, heads=arrow_heads, tail=details.tail)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc