• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19257583787

11 Nov 2025 06:52AM UTC coverage: 91.217% (+0.5%) from 90.748%
19257583787

push

github

bramp
feat(pdf_extract): Update lego_page_layout tool

- Add support for ProgressBar and PartNumber elements.
- Remove NewBag and BagNumber from the example.
- Adjust ProgressBar to be left-aligned with steps and have a margin.
- Regenerate the layout diagram.

4923 of 5397 relevant lines covered (91.22%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.97
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from dataclasses import dataclass
1✔
25

26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    Candidate,
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
34
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
35
    Diagram,
36
    PartsList,
37
    Step,
38
    StepNumber,
39
)
40

41
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
1✔
42

43

44
@dataclass
class _StepScore:
    """Scoring record attached to a single Step candidate."""

    step_number: StepNumber
    """StepNumber element the candidate is built around."""

    has_parts_list: bool
    """True when the candidate was paired with a PartsList."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above (intended range 0.0-1.0).
    Closest placement scores highest; 0.0 when far away or unpaired."""

    step_alignment_score: float
    """Left-edge agreement with the PartsList above (intended range 0.0-1.0).
    Perfect alignment scores 1.0; 0.0 when badly misaligned or unpaired."""

    diagram_area: float
    """Area covered by the candidate's diagram region."""

    def pairing_score(self) -> float:
        """Return the mean of the proximity and alignment scores.

        Candidates without a PartsList always score 0.0, regardless of the
        stored component scores.
        """
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Return the ordering key used when ranking candidates.

        Ascending order on this key puts:
        1. Better StepNumber-PartsList pairings first (score is negated)
        2. Smaller step number values first among equal scores
        """
        quality = self.pairing_score()
        return (-quality, self.step_number.value)
79

80

81
@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    ``evaluate()`` builds one Step candidate for every (StepNumber, PartsList)
    combination plus one no-PartsList fallback per StepNumber. ``classify()``
    then greedily marks winners so that each step number value and each
    PartsList is used at most once.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def evaluate(self, result: ClassificationResult) -> None:
        """Evaluate elements and create Step candidates for all possible pairings.

        Creates Step candidates for each StepNumber, scoring all possible pairings
        with PartsLists. Candidates are stored in ClassificationResult, and the
        best ones will be selected in classify().

        Args:
            result: Classification result holding the step_number and parts_list
                winners; new "step" candidates are added to it.
        """
        page_data = result.page_data

        # Get winners with type safety
        steps = result.get_winners("step_number", StepNumber)

        if not steps:
            # Nothing to build a Step around on this page.
            return

        # Get parts_list winners
        parts_lists = result.get_winners("parts_list", PartsList)

        log.debug(
            "[step] page=%s steps=%d parts_lists=%d",
            page_data.page_number,
            len(steps),
            len(parts_lists),
        )

        # Create Step candidates for all possible pairings
        for step_num in steps:
            # Create candidates for this StepNumber paired with each PartsList
            for parts_list in parts_lists:
                self._create_step_candidate(step_num, parts_list, result)

            # Also create a candidate with no PartsList (fallback). It is added
            # last on purpose: classify() relies on a stable sort, so among
            # equally-scored candidates the paired ones are considered first.
            self._create_step_candidate(step_num, None, result)

        log.debug(
            "[step] Created %d step candidates",
            len(result.get_candidates("step")),
        )

    @staticmethod
    def _score_pairing(
        step_num: StepNumber,
        parts_list: PartsList | None,
    ) -> tuple[float, float]:
        """Score how well a PartsList sits directly above a StepNumber.

        Args:
            step_num: The StepNumber being paired
            parts_list: The candidate PartsList (or None for no pairing)

        Returns:
            ``(proximity_score, alignment_score)``, each within 0.0-1.0. Both
            are 0.0 when there is no PartsList or when its bottom edge is not
            above the top of the StepNumber (within a small tolerance).
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list is not None
            and parts_list.bbox.y1 <= step_num.bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between the PartsList bottom and the StepNumber top.
            # The ABOVE_EPS tolerance admits a slight overlap, which would make
            # the raw difference negative and push the score above 1.0, so the
            # gap is clamped at zero to keep the score inside 0.0-1.0.
            distance = max(0.0, step_num.bbox.y0 - parts_list.bbox.y1)

            # Proximity falls linearly to 0.0 at one threshold-distance away.
            max_distance = step_num.bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = max(0.0, 1.0 - (distance / max_distance))

            # Alignment falls linearly with the left-edge offset.
            max_alignment_diff = step_num.bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list.bbox.x0 - step_num.bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        return proximity_score, alignment_score

    def _create_step_candidate(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> None:
        """Create a Step candidate for a StepNumber paired with a PartsList (or None).

        Args:
            step_num: The StepNumber for this candidate
            parts_list: The PartsList to pair with (or None for no pairing)
            result: Classification result to add the candidate to
        """
        # Calculate pairing scores (both 0.0 unless a parts_list is above)
        proximity_score, alignment_score = self._score_pairing(step_num, parts_list)

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(step_num, parts_list, result)

        # Build Step. When unpaired, an empty placeholder PartsList located at
        # the step number's bbox is stored; classify() recognises an unpaired
        # Step by its PartsList having no parts.
        diagram = Diagram(bbox=diagram_bbox)
        constructed = Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=(
                parts_list
                if parts_list is not None
                else PartsList(bbox=step_num.bbox, parts=[])
            ),
            diagram=diagram,
        )

        # Create score
        score = _StepScore(
            step_number=step_num,
            has_parts_list=parts_list is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Add candidate (not yet a winner)
        step_candidate = Candidate(
            bbox=constructed.bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=constructed,
            source_block=None,
            failure_reason=None,
            is_winner=False,
        )

        result.add_candidate("step", step_candidate)

    def _identify_diagram_region(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now, we create a simple heuristic-based region: from the step
        number's left edge, and from below the step number (and parts list, if
        any), out to the right and bottom edges of the page.

        Args:
            step_num: The step number
            parts_list: The associated parts list (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data
        # Simple heuristic: use the step number's bbox as a starting point
        # In the future, we should look for actual drawing elements below the step

        # Start with step number position
        x0 = step_num.bbox.x0
        y0 = step_num.bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list is not None:
            y0 = max(y0, parts_list.bbox.y1)

        # Extend to a reasonable area (placeholder logic)
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Use the rest of the page width and height as a simple approximation
        x1 = page_bbox.x1
        y1 = page_bbox.y1

        # Create a bbox for the diagram region
        return BBox(x0=x0, y0=y0, x1=x1, y1=y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def classify(self, result: ClassificationResult) -> None:
        """Greedily select the best Step candidates.

        Uses the candidates created in evaluate() to select the best pairings.
        Ensures each StepNumber value and each PartsList is used at most once.

        Args:
            result: Classification result holding the "step" candidates;
                selected candidates are marked as winners on it.
        """
        # Get all Step candidates
        candidate_list = result.get_candidates("step")

        # Sort candidates by score (highest first). sorted() is stable, so
        # candidates with equal keys keep the order get_candidates() returned.
        # NOTE(review): a paired candidate scoring 0.0 ties with the
        # no-PartsList fallback for the same step; presumably candidates come
        # back in insertion order, in which case the paired one wins the tie
        # - confirm this is intended.
        sorted_candidates = sorted(
            candidate_list,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()

        # Greedily select winners
        for candidate in sorted_candidates:
            if candidate.constructed is None:
                continue

            assert isinstance(candidate.constructed, Step)
            step = candidate.constructed
            step_value = step.step_number.value
            # A PartsList without parts is treated as "no parts list" (this is
            # how the placeholder built for unpaired candidates is recognised).
            parts_list = step.parts_list if step.parts_list.parts else None

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used (if it has parts).
            # Identity is tracked with id(): every candidate built from the
            # same PartsList winner holds that same object.
            if parts_list is not None:
                parts_list_id = id(parts_list)
                if parts_list_id in used_parts_list_ids:
                    log.debug(
                        "[step] Skipping candidate for step %d - "
                        "PartsList already used",
                        step_value,
                    )
                    continue
                # Claim this parts_list
                used_parts_list_ids.add(parts_list_id)

            # Mark this candidate as winner
            result.mark_winner(candidate, step)
            used_step_values.add(step_value)

            log.debug(
                "[step] Marking step %d as winner (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list is not None else "no",
                candidate.score_details.pairing_score(),
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc