• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19060277498

04 Nov 2025 06:46AM UTC coverage: 84.229% (-0.02%) from 84.251%
19060277498

push

github

bramp
Chore: Enabled some more lint checks.

src/build_a_long/downloader/legocom.py # modified:  src/build_a_long/downloader/metadata.py # modified:  src/build_a_long/pdf_extract/analyze_classifier.py # modified:
src/build_a_long/pdf_extract/classifier/classification_result.py # modified:  src/build_a_long/pdf_extract/classifier/classifier.py # modified:
src/build_a_long/pdf_extract/classifier/classifier_golden_test.py # modified:  src/build_a_long/pdf_extract/classifier/classifier_rules_test.py # modified:
src/build_a_long/pdf_extract/classifier/classifier_test.py # modified:  src/build_a_long/pdf_extract/classifier/hierarchy_builder.py # modified:
src/build_a_long/pdf_extract/classifier/hierarchy_builder_test.py # modified:  src/build_a_long/pdf_extract/classifier/label_classifier.py # modified:
src/build_a_long/pdf_extract/classifier/lego_page_builder.py # modified:  src/build_a_long/pdf_extract/classifier/lego_page_builder_test.py # modified:
src/build_a_long/pdf_extract/classifier/page_number_classifier.py # modified:  src/build_a_long/pdf_extract/classifier/part_count_classifier.py # modified:
src/build_a_long/pdf_extract/classifier/parts_image_classifier.py # modified:  src/build_a_long/pdf_extract/classifier/parts_list_classifier.py # modified:
src/build_a_long/pdf_extract/classifier/step_classifier.py # modified:  src/build_a_long/pdf_extract/classifier/step_number_classifier.py # modified:
src/build_a_long/pdf_extract/classifier/text_extractors.py # modified:  src/build_a_long/pdf_extract/extractor/bbox.py # modified:  src/build_a_long/pdf_extract/extractor/extractor.py # modified:
src/build_a_long/pdf_extract/extractor/hierarchy.py # modified:  src/build_a_long/pdf_extract/extractor/lego_page_elements.py # modified:  src/build_a_long/pdf_extract/extractor/page_data_json_test.py #
modified:  src/build_a_long/pdf_extract/extractor/page_elements.py # modified:  src/build_a_long/pdf_extract/extractor/pymupdf_types.py # modified:  src/build_a_long/pdf... (continued)

164 of 175 new or added lines in 28 files covered. (93.71%)

255 existing lines in 12 files now uncovered.

3573 of 4242 relevant lines covered (84.23%)

0.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.7
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21

22
- CLASSIFIER_DEBUG=step (or "all")
23
    Enables more verbose, structured logs in this classifier, including
24
    candidate enumeration and rejection reasons.
25
"""
26

27
import logging
1✔
28
import os
1✔
29
from collections.abc import Sequence
1✔
30
from dataclasses import dataclass
1✔
31

32
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
33
    Candidate,
34
    ClassificationHints,
35
    ClassificationResult,
36
    ClassifierConfig,
37
)
38
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
39
    LabelClassifier,
40
)
41
from build_a_long.pdf_extract.extractor import PageData
1✔
42
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
43
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
44
    Diagram,
45
    PartsList,
46
    Step,
47
    StepNumber,
48
)
49
from build_a_long.pdf_extract.extractor.page_elements import (
1✔
50
    Drawing,
51
)
52

53
log = logging.getLogger(__name__)
1✔
54

55

56
@dataclass
class _StepScore:
    """Ranking data attached to each Step candidate.

    Instances are stored as a candidate's ``score_details`` and are only used
    to order Step candidates via :meth:`sort_key`.
    """

    # The step number this step is associated with.
    step_number: StepNumber

    # Whether this step has an associated parts list.
    has_parts_list: bool

    # Area of the diagram region.
    diagram_area: float

    def sort_key(self) -> tuple[int, int, float]:
        """Build the ordering key used when ranking Step candidates.

        Keys compare element by element, so ascending order yields:

        1. Lower step number values first (keeps steps in order).
        2. Among equal step numbers, steps with a parts list first.
        3. Among those, larger diagram areas first (more content).

        Returns:
            ``(step number value, negated parts-list flag, negated area)``.
        """
        number_rank = self.step_number.value
        # Negated so that "has a parts list" (-1) sorts ahead of "has none" (0).
        parts_list_rank = -int(self.has_parts_list)
        # Negated so that larger areas sort first in ascending order.
        area_rank = -self.diagram_area
        return (number_rank, parts_list_rank, area_rank)
1✔
78

79

80
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    ``evaluate`` pairs every winning step_number with the closest parts_list
    above it (if any), derives a diagram region, and registers one "step"
    candidate per step number. ``classify`` then marks every successfully
    constructed candidate as a winner.
    """

    outputs = {"step"}
    requires = {"step_number", "parts_list"}

    def __init__(self, config: ClassifierConfig, classifier):
        """Initialise the classifier and read the debug switch.

        Args:
            config: Classifier configuration, forwarded to the base class.
            classifier: Forwarded unchanged to the base class.
        """
        super().__init__(config, classifier)
        # Verbose logging is opt-in via CLASSIFIER_DEBUG=step (or "all").
        self._debug_enabled = os.getenv("CLASSIFIER_DEBUG", "").lower() in (
            "step",
            "all",
        )

    def evaluate(
        self,
        page_data: PageData,
        result: ClassificationResult,
    ) -> None:
        """Evaluate elements and create candidates for complete Step structures.

        Combines step_number and parts_list elements, identifies diagram regions,
        and adds one "step" candidate to ``result`` per winning step number.
        Nothing is added when the page has no winning step numbers.

        Args:
            page_data: The page being classified.
            result: Classification result to read candidates from and add
                "step" candidates to.
        """
        # Winning step_number candidates and their constructed StepNumber elements.
        # isinstance() is False for None, so no separate None check is required.
        steps: list[StepNumber] = [
            candidate.constructed
            for candidate in result.get_candidates("step_number")
            if candidate.is_winner and isinstance(candidate.constructed, StepNumber)
        ]

        if not steps:
            return

        # Winning parts_list candidates and their constructed PartsList elements.
        parts_lists: list[PartsList] = [
            candidate.constructed
            for candidate in result.get_candidates("parts_list")
            if candidate.is_winner and isinstance(candidate.constructed, PartsList)
        ]

        if self._debug_enabled:
            # The drawings are only needed for this log line, so collect them
            # here rather than on every call.
            drawings: list[Drawing] = [
                e for e in page_data.elements if isinstance(e, Drawing)
            ]
            log.debug(
                "[step] page=%s elements=%d steps=%d parts_lists=%d drawings=%d",
                page_data.page_number,
                len(page_data.elements),
                len(steps),
                len(parts_lists),
                len(drawings),
            )

        # Build a Step for each step_number
        for step_num in steps:
            # Find the parts_list associated with this step (if any).
            # Parts lists are typically above the step number.
            associated_parts_list = self._find_associated_parts_list(
                step_num, parts_lists
            )
            has_parts_list = associated_parts_list is not None

            # Identify the diagram region for this step
            diagram_bbox = self._identify_diagram_region(
                step_num, associated_parts_list, page_data
            )

            score = _StepScore(
                step_number=step_num,
                has_parts_list=has_parts_list,
                diagram_area=diagram_bbox.area,
            )

            diagram = Diagram(
                bbox=diagram_bbox,
            )

            # Use an explicit None check (not truthiness) so the placeholder is
            # substituted exactly when has_parts_list is False.
            step_parts_list = (
                associated_parts_list
                if associated_parts_list is not None
                else PartsList(bbox=step_num.bbox, parts=[])
            )

            constructed = Step(
                bbox=self._compute_step_bbox(step_num, associated_parts_list, diagram),
                step_number=step_num,
                parts_list=step_parts_list,
                diagram=diagram,
            )

            # Add candidate - Note: Step is a synthetic element combining
            # step_number, parts_list, and diagram, so source_element is None
            result.add_candidate(
                "step",
                Candidate(
                    bbox=constructed.bbox,
                    label="step",
                    score=1.0,  # Step uses ranking rather than scores
                    score_details=score,
                    constructed=constructed,
                    source_element=None,  # Synthetic element has no single source
                    failure_reason=None,
                    is_winner=False,  # Will be set by classify()
                ),
            )

    def _find_associated_parts_list(
        self, step_num: StepNumber, parts_lists: Sequence[PartsList]
    ) -> PartsList | None:
        """Find the parts list associated with a step number.

        The parts list is typically above the step number. We look for parts lists
        that are above the step and choose the closest one. On a tie, the one
        appearing first in ``parts_lists`` is returned.

        Args:
            step_num: The step number to find a parts list for
            parts_lists: List of all parts lists on the page

        Returns:
            The associated PartsList or None if no suitable parts list is found
        """
        return min(
            (
                parts_list
                for parts_list in parts_lists
                if self._is_parts_list_above_step(parts_list, step_num)
            ),
            # Vertical gap between the bottom of the parts list and the top of
            # the step number; the smallest gap is the closest parts list.
            key=lambda parts_list: step_num.bbox.y0 - parts_list.bbox.y1,
            default=None,
        )

    def _is_parts_list_above_step(
        self, parts_list: PartsList, step_num: StepNumber
    ) -> bool:
        """Check if a parts list is spatially above a step number.

        The parts list counts as "above" when its bottom edge (``y1``) is no
        lower than the step number's top edge (``y0``) plus a small tolerance.

        Args:
            parts_list: The parts list element
            step_num: The step number element

        Returns:
            True if the parts list is above the step number
        """
        # Allow a slight overlap between the two boxes.
        tolerance = 2.0
        return parts_list.bbox.y1 <= step_num.bbox.y0 + tolerance

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        page_data: PageData,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now, we create a simple heuristic-based region: from the step
        number's left edge, starting below the step number (and below the parts
        list, if any), to the page's right and bottom edges.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            page_data: The page data; its ``bbox`` must not be None

        Returns:
            BBox representing the diagram region
        """
        # TODO: Find actual drawing elements and use their bounds

        # Start with the step number position
        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        page_bbox = page_data.bbox
        assert page_bbox is not None, "page_data.bbox is required"

        # Use the rest of the page width and height as a simple approximation
        return BBox(x0=x0, y0=y0, x1=page_bbox.x1, y1=page_bbox.y1)

    # TODO This seems a useful union function for the bbox element.
    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox(
            x0=min(b.x0 for b in bboxes),
            y0=min(b.y0 for b in bboxes),
            x1=max(b.x1 for b in bboxes),
            y1=max(b.y1 for b in bboxes),
        )

    def classify(
        self,
        page_data: PageData,
        result: ClassificationResult,
        hints: ClassificationHints | None,
    ) -> None:
        """Classify Step candidates and mark winners.

        Candidates are visited in ``_StepScore.sort_key`` order and every
        candidate with a constructed Step is marked as a winner.

        Args:
            page_data: The page data (unused)
            result: Classification result to update
            hints: Optional hints (unused)
        """
        # Get pre-built candidates, ordered by our scoring criteria
        sorted_candidates = sorted(
            result.get_candidates("step"),
            key=lambda c: c.score_details.sort_key(),
        )

        # Mark winners (all successfully constructed candidates)
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                # Already has failure_reason from evaluate
                continue

            assert isinstance(candidate.constructed, Step)

            # Step is synthetic and has no source_element, so there is no
            # underlying element that other classifiers could have removed,
            # and nothing overlapping to remove afterwards.
            result.mark_winner(
                candidate, candidate.source_element, candidate.constructed
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc