• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19058602748

04 Nov 2025 04:38AM UTC coverage: 88.027% (-0.09%) from 88.118%
19058602748

push

github

bramp
Add Step classifier and refactor Candidate to support synthetic elements

- Implement StepClassifier that combines step_number, parts_list, and diagram into Step structures
- Refactor Candidate class to make source_element optional and add required bbox field
- Step is a synthetic/composite element with no single source, so source_element is None
- Update all classifiers to pass bbox explicitly when creating Candidates
- Add comprehensive tests for StepClassifier covering various scenarios
- Update mark_winner and helper methods to handle optional source_element
- Clean up unused step_elements dictionary from StepClassifier
- All tests passing (10/10 classifier test suites)

179 of 181 new or added lines in 5 files covered. (98.9%)

111 existing lines in 10 files now uncovered.

3529 of 4009 relevant lines covered (88.03%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.7
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21

22
- CLASSIFIER_DEBUG=step (or "all")
23
    Enables more verbose, structured logs in this classifier, including
24
    candidate enumeration and rejection reasons.
25
"""
26

27
import logging
1✔
28
import os
1✔
29
from dataclasses import dataclass
1✔
30
from typing import Optional, Sequence
1✔
31

32
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
33
    LabelClassifier,
34
)
35
from build_a_long.pdf_extract.classifier.types import (
1✔
36
    Candidate,
37
    ClassificationHints,
38
    ClassificationResult,
39
    ClassifierConfig,
40
)
41
from build_a_long.pdf_extract.extractor import PageData
1✔
42
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
43
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
44
    Diagram,
45
    PartsList,
46
    Step,
47
    StepNumber,
48
)
49
from build_a_long.pdf_extract.extractor.page_elements import (
1✔
50
    Drawing,
51
)
52

53
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
1✔
54

55

56
@dataclass
class _StepScore:
    """Internal score representation for step classification.

    Instances are attached to Step candidates as ``score_details`` and are
    ordered through :meth:`sort_key`.
    """

    step_number: StepNumber
    """The step number this step is associated with."""

    has_parts_list: bool
    """Whether this step has an associated parts list."""

    diagram_area: float
    """Area of the diagram region."""

    def sort_key(self) -> tuple[int, int, float]:
        """Return a tuple for sorting candidates in ascending order.

        We prefer:
        1. Lower step number values (to maintain order)
        2. Steps with parts lists over those without
        3. Larger diagram areas (more content)

        The second and third components are negated so that, under an
        ascending sort, ``has_parts_list=True`` and larger areas come first.

        Returns:
            ``(step value, -int(has_parts_list), -diagram_area)``
        """
        return (self.step_number.value, -int(self.has_parts_list), -self.diagram_area)
1✔
78

79

80
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    For every winning ``step_number`` candidate, pairs the StepNumber with the
    closest PartsList located above it (if any), derives a placeholder diagram
    region, and registers a synthetic ``step`` candidate.
    """

    outputs = {"step"}
    requires = {"step_number", "parts_list"}

    # Tolerance (in bbox coordinate units) by which a parts list's bottom edge
    # may extend past the step number's top edge and still count as "above".
    _ABOVE_EPS = 2.0

    def __init__(self, config: ClassifierConfig, classifier):
        """Initialise the classifier and read the debug switch.

        Args:
            config: Classifier configuration, forwarded to the base class.
            classifier: Owning classifier object, forwarded to the base class.
        """
        super().__init__(config, classifier)
        # Verbose logging is opt-in through CLASSIFIER_DEBUG=step (or "all").
        self._debug_enabled = os.getenv("CLASSIFIER_DEBUG", "").lower() in (
            "step",
            "all",
        )

    def evaluate(
        self,
        page_data: PageData,
        result: ClassificationResult,
    ) -> None:
        """Evaluate elements and create candidates for complete Step structures.

        Combines step_number and parts_list elements, identifies diagram regions,
        and adds one "step" candidate to ``result`` per winning step number.

        Args:
            page_data: The page being classified.
            result: Classification result to read winners from and add
                candidates to.
        """
        # Only winning candidates that actually constructed a StepNumber count.
        # isinstance() already rejects None, so no separate None check is needed.
        steps: list[StepNumber] = [
            candidate.constructed
            for candidate in result.get_candidates("step_number")
            if candidate.is_winner and isinstance(candidate.constructed, StepNumber)
        ]
        if not steps:
            return

        parts_lists: list[PartsList] = [
            candidate.constructed
            for candidate in result.get_candidates("parts_list")
            if candidate.is_winner and isinstance(candidate.constructed, PartsList)
        ]

        if self._debug_enabled:
            # Drawings are only reported here, so count them only when needed.
            drawing_count = sum(isinstance(e, Drawing) for e in page_data.elements)
            log.debug(
                "[step] page=%s elements=%d steps=%d parts_lists=%d drawings=%d",
                page_data.page_number,
                len(page_data.elements),
                len(steps),
                len(parts_lists),
                drawing_count,
            )

        # Build a Step for each step_number
        for step_num in steps:
            # Parts lists are typically above the step number
            associated_parts_list = self._find_associated_parts_list(
                step_num, parts_lists
            )

            diagram_bbox = self._identify_diagram_region(
                step_num, associated_parts_list, page_data
            )

            score = _StepScore(
                step_number=step_num,
                has_parts_list=associated_parts_list is not None,
                diagram_area=diagram_bbox.area,
            )

            # Diagram is a synthetic region, not from a single source
            diagram = Diagram(bbox=diagram_bbox, id=None)

            # Fall back to an empty parts list anchored at the step number only
            # when no parts list was found. The explicit None test keeps this in
            # agreement with ``has_parts_list`` above.
            if associated_parts_list is not None:
                step_parts_list = associated_parts_list
            else:
                step_parts_list = PartsList(bbox=step_num.bbox, parts=[], id=None)

            constructed = Step(
                bbox=self._compute_step_bbox(step_num, associated_parts_list, diagram),
                step_number=step_num,
                parts_list=step_parts_list,
                diagram=diagram,
                id=step_num.id,  # Use step number's id as the Step's id
            )

            # Step is a synthetic element combining step_number, parts_list,
            # and diagram, so source_element is None
            result.add_candidate(
                "step",
                Candidate(
                    bbox=constructed.bbox,
                    label="step",
                    score=1.0,  # Step uses ranking rather than scores
                    score_details=score,
                    constructed=constructed,
                    source_element=None,  # Synthetic element has no single source
                    failure_reason=None,
                    is_winner=False,  # Will be set by classify()
                ),
            )

    def _find_associated_parts_list(
        self, step_num: StepNumber, parts_lists: Sequence[PartsList]
    ) -> Optional[PartsList]:
        """Find the parts list associated with a step number.

        The parts list is typically above the step number. We look for parts lists
        that are above the step and choose the closest one, measured as the gap
        between the parts list's bottom edge and the step number's top edge.

        Args:
            step_num: The step number to find a parts list for
            parts_lists: List of all parts lists on the page

        Returns:
            The associated PartsList or None if no suitable parts list is found
        """
        above = [
            parts_list
            for parts_list in parts_lists
            if self._is_parts_list_above_step(parts_list, step_num)
        ]
        if not above:
            return None

        # min() returns the first of equally close parts lists, matching the
        # result of a stable sort followed by taking the first element.
        return min(above, key=lambda pl: step_num.bbox.y0 - pl.bbox.y1)

    def _is_parts_list_above_step(
        self, parts_list: PartsList, step_num: StepNumber
    ) -> bool:
        """Check if a parts list is spatially above a step number.

        Args:
            parts_list: The parts list element
            step_num: The step number element

        Returns:
            True if the parts list's bottom edge is at or above the step number's
            top edge, allowing an overlap of up to ``_ABOVE_EPS``
        """
        return parts_list.bbox.y1 <= step_num.bbox.y0 + self._ABOVE_EPS

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: Optional[PartsList],
        page_data: PageData,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now this is a simple heuristic: the region starts at the step number's
        left edge, below the step number (and below the parts list, if any), and
        extends to the page's right and bottom edges.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            page_data: The page data

        Returns:
            BBox representing the diagram region

        Raises:
            ValueError: If ``page_data.bbox`` is None.
        """
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        if page_bbox is None:
            # Explicit check instead of ``assert`` so it still runs under -O.
            raise ValueError(
                "page_data.bbox is required to identify the diagram region"
            )

        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        # Use the rest of the page width and height as a simple approximation
        return BBox(x0=x0, y0=y0, x1=page_bbox.x1, y1=page_bbox.y1)

    # TODO This seems a useful union function for the bbox element.
    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: Optional[PartsList],
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox(
            x0=min(b.x0 for b in bboxes),
            y0=min(b.y0 for b in bboxes),
            x1=max(b.x1 for b in bboxes),
            y1=max(b.y1 for b in bboxes),
        )

    def classify(
        self,
        page_data: PageData,
        result: ClassificationResult,
        hints: Optional[ClassificationHints],
    ) -> None:
        """Classify Step candidates and mark winners.

        Every candidate that carries a constructed Step is marked as a winner,
        visited in the order given by ``_StepScore.sort_key``.

        Args:
            page_data: The page data (unused)
            result: Classification result to update
            hints: Optional hints (unused)
        """
        # Sort the pre-built candidates based on our scoring criteria
        sorted_candidates = sorted(
            result.get_candidates("step"),
            key=lambda c: c.score_details.sort_key(),
        )

        # Mark winners (all successfully constructed candidates)
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                # Nothing was constructed for this candidate, so it cannot win.
                continue

            assert isinstance(candidate.constructed, Step)

            # Step is synthetic and has no source_element, so there is no
            # underlying element that another classifier could have removed,
            # and no overlapping elements to remove after marking the winner.
            result.mark_winner(
                candidate, candidate.source_element, candidate.constructed
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc