• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19995046189

06 Dec 2025 10:18PM UTC coverage: 90.506% (+0.09%) from 90.421%
19995046189

push

github

bramp
test: regenerate golden files for step classifier refactoring

10525 of 11629 relevant lines covered (90.51%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.87
/src/build_a_long/pdf_extract/classifier/steps/diagram_classifier.py
1
"""
2
Diagram classifier.
3

4
Purpose
5
-------
6
Identify diagram regions on instruction pages. Diagrams are any images or
7
drawings on the page, distinct from PartImages (which are single LEGO pieces).
8
Sometimes a diagram is split into multiple smaller images that are positioned
9
next to each other, so we cluster them together.
10

11
Heuristic
12
---------
13
- Look for Image elements on the page
14
- Filter out full-page images (> 95% of page area, likely backgrounds or borders)
15
- Each Image becomes a diagram candidate (no clustering during scoring)
16
- During build(), expand to include adjacent unclaimed images (lazy clustering)
17

18
Lazy Clustering
19
---------------
20
Clustering is deferred to build() time. This allows other classifiers (like
21
SubAssemblyClassifier) to claim images first. When build() is called:
22
1. Start with the candidate's source image
23
2. Find all adjacent/overlapping unclaimed images
24
3. Cluster them together into a single Diagram
25
4. Mark all clustered images as consumed
26

27
Re-scoring
28
----------
29
When a diagram's source image conflicts with another candidate (e.g., an arrow
30
that claims it), the diagram can be re-scored without the excluded blocks.
31
Because each candidate wraps a single image, the candidate is dropped when that
32
image is excluded and is returned unchanged otherwise.
33

34
Debugging
35
---------
36
Enable with `LOG_LEVEL=DEBUG` for structured logs.
37
"""
38

39
from __future__ import annotations
1✔
40

41
import logging
1✔
42
from typing import ClassVar
1✔
43

44
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
45
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
46
    ClassificationResult,
47
)
48
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
49
    LabelClassifier,
50
)
51
from build_a_long.pdf_extract.classifier.score import Score, Weight
1✔
52
from build_a_long.pdf_extract.extractor.bbox import (
1✔
53
    BBox,
54
    build_all_connected_clusters,
55
)
56
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
57
    Diagram,
58
)
59
from build_a_long.pdf_extract.extractor.page_blocks import (
1✔
60
    Image,
61
)
62

63
# Module-level logger named after this module; the classifier's "[diagram]"
# debug messages are emitted through it.
log = logging.getLogger(__name__)
1✔
64

65

66
class _DiagramScore(Score):
    """Score details attached to a single diagram candidate.

    Records the bounding box of the diagram and how many images it is made
    of. The numeric score itself is a constant (see ``score``).
    """

    cluster_bbox: BBox
    """Bounding box encompassing the entire diagram cluster."""

    num_images: int
    """Number of images/drawings in this cluster."""

    def score(self) -> Weight:
        """Return the weight of this diagram candidate.

        Returns:
            Always ``1.0``; the recorded fields do not influence the value.
        """
        # Every diagram candidate is weighted equally. Images that should not
        # become diagrams are rejected while candidates are created in the
        # classifier's _score() method, not here.
        return 1.0
1✔
80

81

82
class DiagramClassifier(LabelClassifier):
    """Classifier for diagram regions on instruction pages.

    Scoring creates one candidate per eligible ``Image`` block. Grouping of
    neighbouring images into a single diagram is deferred to ``build()`` so
    that images consumed by other classifiers in the meantime are left out
    of the cluster.
    """

    output = "diagram"
    requires = frozenset(
        {
            # Arrows typically overlap diagrams - so we exclude them upfront
            "arrow",
        }
    )

    # TODO Convert to configurable parameters
    # Area filtering threshold (as ratio of page area)
    MAX_AREA_RATIO: ClassVar[float] = (
        0.95  # Filter out images > 95% of page (backgrounds/borders)
    )

    def _score(self, result: ClassificationResult) -> None:
        """Score Image blocks and create one candidate per image.

        Clustering is deferred to build() time to allow other classifiers
        to claim images first.

        An image is skipped when it is one of the source blocks of a
        non-failed "arrow" candidate, or when it covers more than
        ``MAX_AREA_RATIO`` of the page area.

        Args:
            result: Classification result for the page. It is read for the
                page blocks and arrow candidates, and receives the new
                diagram candidates via ``add_candidate``.
        """
        page_data = result.page_data
        page_bbox = page_data.bbox
        assert page_bbox is not None

        arrow_candidates = result.get_scored_candidates(
            "arrow", valid_only=False, exclude_failed=True
        )

        # Get all image blocks, filtering out full-page images
        image_blocks: list[Image] = []
        for block in page_data.blocks:
            # Only consider Image elements
            if not isinstance(block, Image):
                continue

            # Skip if part of an arrow's source blocks
            if any(block in arrow.source_blocks for arrow in arrow_candidates):
                continue

            # Filter based on area relative to page
            area_ratio = block.bbox.area / page_bbox.area

            # Skip full-page images (> 95% of page area)
            # These are likely borders/backgrounds
            # TODO This may not be necessary as we filter out all background
            # blocks a lot earlier
            if area_ratio > self.MAX_AREA_RATIO:
                continue

            image_blocks.append(block)

        if not image_blocks:
            log.debug(
                "[diagram] No image blocks found on page %s",
                page_data.page_number,
            )
            return

        log.debug(
            "[diagram] page=%s image_blocks=%d",
            page_data.page_number,
            len(image_blocks),
        )

        # Create one candidate per image (no clustering during scoring)
        for block in image_blocks:
            score_details = _DiagramScore(
                cluster_bbox=block.bbox,
                num_images=1,
            )

            candidate = Candidate(
                bbox=block.bbox,
                label="diagram",
                score=score_details.score(),
                score_details=score_details,
                source_blocks=[block],
            )
            result.add_candidate(candidate)

    # TODO Can we delete this
    def rescore_without_blocks(
        self,
        candidate: Candidate,
        excluded_block_ids: set[int],
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a new diagram candidate excluding specified blocks.

        Since each candidate now represents a single image, if that image
        is excluded, the candidate is no longer valid.

        Only the first source block is inspected; a candidate with no source
        blocks is returned unchanged.

        Args:
            candidate: The original candidate to re-score
            excluded_block_ids: Set of block IDs to exclude
            result: The classification result context (not used by this
                implementation)

        Returns:
            The same candidate if the image is not excluded, None otherwise.
        """
        # With single-image candidates, if the block is excluded, return None
        if (
            candidate.source_blocks
            and candidate.source_blocks[0].id in excluded_block_ids
        ):
            return None
        return candidate

    def build(self, candidate: Candidate, result: ClassificationResult) -> Diagram:
        """Construct a Diagram element with lazy clustering.

        Starting from the candidate's source image, expands to include all
        adjacent/overlapping unclaimed images, clustering them into a single
        Diagram.

        Side effect: ``candidate.source_blocks`` is replaced with the full
        list of clustered images.

        Args:
            candidate: Diagram candidate; must have exactly one source block,
                and that block must be an ``Image``.
            result: Classification result providing the page bounds and the
                set of already consumed blocks.

        Returns:
            A ``Diagram`` whose bbox is the union of the clustered image
            bboxes, clipped to the page bbox.
        """
        page_bbox = result.page_data.bbox
        assert page_bbox is not None

        # Start with the candidate's source block
        assert len(candidate.source_blocks) == 1
        seed_block = candidate.source_blocks[0]
        assert isinstance(seed_block, Image)

        # Find all unclaimed images that can be clustered with this one
        clustered_blocks = self._expand_cluster(seed_block, result)

        # Calculate the combined bbox
        cluster_bbox = BBox.union_all([b.bbox for b in clustered_blocks])

        # Clip diagram bbox to page bounds
        diagram_bbox = cluster_bbox.clip_to(page_bbox)

        # Update the candidate's source_blocks to include all clustered blocks
        # This ensures they all get marked as consumed
        candidate.source_blocks = list(clustered_blocks)

        log.debug(
            "[diagram] Building diagram at %s (clustered %d images)",
            diagram_bbox,
            len(clustered_blocks),
        )

        return Diagram(bbox=diagram_bbox)

    def _expand_cluster(
        self, seed_block: Image, result: ClassificationResult
    ) -> list[Image]:
        """Expand from a seed image to include all adjacent unclaimed images.

        Collects every ``Image`` block on the page whose id is not in the
        result's consumed set, groups them with
        ``build_all_connected_clusters`` and returns the group that contains
        the seed.

        Args:
            seed_block: The starting image block
            result: Classification result to check consumed blocks

        Returns:
            List of all images in the cluster (including seed). If the seed
            is itself consumed, or no cluster contains it, the list holds
            only the seed.
        """
        # sorted() walks the whole consumed set, so only pay for it when the
        # message will actually be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "[diagram] _expand_cluster: seed=%d at %s, consumed_blocks=%s",
                seed_block.id,
                seed_block.bbox,
                sorted(result._consumed_blocks),
            )

        # Get all unclaimed image blocks on the page
        unclaimed_images: list[Image] = []
        for block in result.page_data.blocks:
            if not isinstance(block, Image):
                continue
            # Skip if already consumed
            if block.id in result._consumed_blocks:
                log.debug(
                    "[diagram] Skipping consumed image id=%d at %s",
                    block.id,
                    block.bbox,
                )
                continue
            unclaimed_images.append(block)

        if seed_block not in unclaimed_images:
            # Seed was already consumed (shouldn't happen, but be safe)
            return [seed_block]

        # Build clusters from unclaimed images
        clusters = build_all_connected_clusters(unclaimed_images)

        # Find the cluster containing our seed block
        for cluster in clusters:
            if seed_block in cluster:
                return list(cluster)

        # Fallback: just return the seed block
        return [seed_block]
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc