• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19724286935

27 Nov 2025 03:27AM UTC coverage: 89.068% (-2.2%) from 91.307%
19724286935

push

github

bramp
test(pdf_extract:classifier): Refactor tests to use fixtures and remove integration test

This commit refactors the classifier test suite to improve maintainability and isolation.

Key changes:
- **Fixtures:** Introduced `conftest.py` with `classifier` and `candidate_factory` fixtures to streamline test setup and candidate creation.
- **Unit Test Conversion:** Updated `parts_classifier_test.py`, `step_classifier_test.py`, `page_number_classifier_test.py`, `part_count_classifier_test.py`, and `step_number_classifier_test.py` to use these fixtures, removing dependency on `classify_elements` and making them true unit tests.
- **Integration Test Removal:** Deleted `test_font_size_integration.py` (and its temporary rename `font_size_scoring_test.py`) as its logic has been moved into the respective classifier unit tests.
- **Cleanup:** Removed debug prints and ensured strict type checking compliance.

382 of 389 new or added lines in 9 files covered. (98.2%)

291 existing lines in 28 files now uncovered.

7585 of 8516 relevant lines covered (89.07%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.76
/src/build_a_long/pdf_extract/classifier/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from dataclasses import dataclass
1✔
25

26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    Candidate,
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.classifier.text_extractors import (
1✔
34
    extract_step_number_value,
35
)
36
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
37
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
38
    Diagram,
39
    LegoPageElements,
40
    PartsList,
41
    Step,
42
    StepNumber,
43
)
44
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
45

46
# Module-level logger named after this module; it carries the "[step]" debug
# messages emitted by the classifier.
log = logging.getLogger(__name__)
1✔
47

48

49
@dataclass
class _StepScore:
    """Scoring record for one StepNumber / PartsList pairing.

    Holds references to the parent candidates together with the individual
    score components, and knows how to collapse them into a single pairing
    score and a sort key.
    """

    step_number_candidate: Candidate
    """StepNumber candidate that anchors this step."""

    parts_list_candidate: Candidate | None
    """PartsList candidate paired with the step number, or None."""

    has_parts_list: bool
    """True when a parts list is paired with this step."""

    step_proximity_score: float
    """Vertical closeness to the PartsList above, in 0.0-1.0.
    1.0 means closest, 0.0 means very far or no parts list."""

    step_alignment_score: float
    """Left-edge alignment with the PartsList above, in 0.0-1.0.
    1.0 means perfectly aligned, 0.0 means badly misaligned or no parts list."""

    diagram_area: float
    """Area of the estimated diagram region."""

    def pairing_score(self) -> float:
        """Return the mean of proximity and alignment, or 0.0 without a parts list."""
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Return the key used to order step candidates.

        Ordering preference:
        1. Higher pairing score first (hence the negation).
        2. Lower step number value first, as a tie-breaker.

        The step number value is parsed from the first source block of the
        step number candidate; 0 is used when it cannot be determined.
        """
        blocks = self.step_number_candidate.source_blocks
        # Only the first source block is consulted for the step number text.
        first_block = blocks[0] if blocks else None

        parsed_value = None
        if isinstance(first_block, Text):
            parsed_value = extract_step_number_value(first_block.text)

        tie_breaker = parsed_value if parsed_value is not None else 0
        return (-self.pairing_score(), tie_breaker)
99

100

101
@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    Scoring pairs every step_number candidate with every parts_list candidate
    (plus a "no parts list" fallback), then greedily keeps the best pairing
    per step number value. Construction turns a kept candidate into a Step
    made of a StepNumber, a PartsList and a heuristic Diagram region.
    """

    outputs = frozenset({"step"})
    requires = frozenset({"step_number", "parts_list"})

    def score(self, result: ClassificationResult) -> None:
        """Score step pairings and create candidates WITHOUT construction.

        Args:
            result: Classification result to read step_number / parts_list
                candidates from and to add the selected "step" candidates to.
        """
        page_data = result.page_data

        # Work on candidates (not constructed elements) of the parent labels.
        step_candidates = result.get_scored_candidates(
            "step_number", valid_only=False, exclude_failed=True
        )

        if not step_candidates:
            return

        parts_list_candidates = result.get_scored_candidates(
            "parts_list",
            valid_only=False,
            exclude_failed=True,
        )

        log.debug(
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
            page_data.page_number,
            len(step_candidates),
            len(parts_list_candidates),
        )

        # Build every possible pairing. The trailing None is the fallback
        # "step without a parts list"; it must come last for each step number.
        # NOTE(review): a pairing whose score is 0.0 ties with the fallback and,
        # because sorting is stable, is preferred over it by insertion order.
        # Confirm this is intended.
        all_candidates: list[Candidate] = []
        for step_candidate in step_candidates:
            for parts_list_candidate in [*parts_list_candidates, None]:
                candidate = self._create_step_candidate(
                    step_candidate, parts_list_candidate, result
                )
                if candidate is not None:
                    all_candidates.append(candidate)

        # Greedily select the best candidates (deduplication)
        deduplicated_candidates = self._deduplicate_candidates(all_candidates)

        for candidate in deduplicated_candidates:
            result.add_candidate("step", candidate)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(deduplicated_candidates),
            len(all_candidates),
        )

    def construct(self, result: ClassificationResult) -> None:
        """Construct Step elements from candidates.

        Construction is best-effort: a failure for one candidate is recorded
        in its ``failure_reason`` (and logged at DEBUG level) and does not
        stop the remaining candidates from being constructed.

        Args:
            result: Classification result holding the "step" candidates.
        """
        for candidate in result.get_candidates("step"):
            try:
                candidate.constructed = self.construct_candidate(candidate, result)
            except Exception as e:
                candidate.failure_reason = str(e)
                # Keep a trace of the swallowed error for debugging.
                log.debug(
                    "[step] Failed to construct step candidate: %s",
                    e,
                    exc_info=True,
                )

    def construct_candidate(
        self, candidate: Candidate, result: ClassificationResult
    ) -> LegoPageElements:
        """Construct a Step element from a single candidate.

        Args:
            candidate: A "step" candidate whose ``score_details`` is a
                ``_StepScore``.
            result: Classification result used to construct the parent
                step number / parts list candidates.

        Returns:
            The constructed Step. When no parts list is paired, an empty
            PartsList placed at the step number's bbox is used instead.
        """
        score = candidate.score_details
        assert isinstance(score, _StepScore)

        # Construct the step number from its parent candidate.
        step_num = result.construct_candidate(score.step_number_candidate)
        assert isinstance(step_num, StepNumber)

        # Construct the parts list from its parent candidate (if present).
        parts_list: PartsList | None = None
        if score.parts_list_candidate is not None:
            parts_list_elem = result.construct_candidate(score.parts_list_candidate)
            assert isinstance(parts_list_elem, PartsList)
            parts_list = parts_list_elem

        # Identify diagram region
        diagram_bbox = self._identify_diagram_region(
            step_num.bbox,
            parts_list.bbox if parts_list is not None else None,
            result,
        )

        # Build Step
        diagram = Diagram(bbox=diagram_bbox)
        return Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=(
                parts_list
                if parts_list is not None
                else PartsList(bbox=step_num.bbox, parts=[])
            ),
            diagram=diagram,
        )

    def _create_step_candidate(
        self,
        step_candidate: Candidate,
        parts_list_candidate: Candidate | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a Step candidate WITHOUT construction.

        Args:
            step_candidate: The StepNumber candidate for this step
            parts_list_candidate: The PartsList candidate to pair with (or None)
            result: Classification result

        Returns:
            The created Candidate with score but no construction. The current
            implementation always returns a Candidate; ``None`` stays in the
            return type because callers already handle it.
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        step_bbox = step_candidate.bbox
        parts_list_bbox = (
            parts_list_candidate.bbox if parts_list_candidate is not None else None
        )

        # Pairing scores stay at 0.0 unless the parts list sits above the step.
        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list_bbox is not None
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between parts list bottom and step number top.
            # It can be slightly negative (down to -ABOVE_EPS) when they overlap.
            distance = step_bbox.y0 - parts_list_bbox.y1

            # Proximity: 1.0 when touching, falling to 0.0 at max_distance.
            # Clamped on both sides so an overlap cannot push it above 1.0.
            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                proximity_score = min(
                    1.0, max(0.0, 1.0 - (distance / max_distance))
                )

            # Alignment: how well the left edges line up.
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Estimate diagram bbox for scoring purposes
        diagram_bbox = self._identify_diagram_region(step_bbox, parts_list_bbox, result)

        # Create score object with candidate references
        score = _StepScore(
            step_number_candidate=step_candidate,
            parts_list_candidate=parts_list_candidate,
            has_parts_list=parts_list_candidate is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # Calculate combined bbox for the candidate
        bboxes = [step_bbox, diagram_bbox]
        if parts_list_bbox is not None:
            bboxes.append(parts_list_bbox)
        combined_bbox = BBox.union_all(bboxes)

        # Create candidate WITHOUT construction
        return Candidate(
            bbox=combined_bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            constructed=None,
            source_blocks=[],
            failure_reason=None,
        )

    def _identify_diagram_region(
        self,
        step_bbox: BBox,
        parts_list_bbox: BBox | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        The diagram is typically the large area below the step number and parts list.
        For now, we create a simple heuristic-based region: it starts at the
        step number's left edge, below both the step number and the parts list,
        and extends to the page's right and bottom edges.

        Args:
            step_bbox: The step number bbox
            parts_list_bbox: The associated parts list bbox (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        page_data = result.page_data

        # Start with step number position
        x0 = step_bbox.x0
        y0 = step_bbox.y1  # Below the step number

        # If there's a parts list, the diagram should be below it
        if parts_list_bbox is not None:
            y0 = max(y0, parts_list_bbox.y1)

        # Extend to a reasonable area (placeholder logic)
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Use the rest of the page width and height as a simple approximation
        return BBox(x0=x0, y0=y0, x1=page_bbox.x1, y1=page_bbox.y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        This encompasses the step number, parts list (if any), and diagram.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Combined bounding box
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:
        """Greedily select the best Step candidates.

        Ensures each StepNumber value is used at most once, and each PartsList
        that has parts is used at most once. Candidates whose step number
        value cannot be read from a Text source block are dropped.

        Args:
            candidates: All possible Step candidates

        Returns:
            Deduplicated list of Step candidates
        """
        # Best pairing score first; ties broken by lower step number, then by
        # input order (sorted() is stable).
        sorted_candidates = sorted(
            candidates,
            key=lambda c: c.score_details.sort_key(),
        )

        # Track which StepNumber values and PartsLists have been used
        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()
        selected: list[Candidate] = []

        # Greedily select winners
        for candidate in sorted_candidates:
            # Get step info from score_details (candidates not yet constructed)
            assert isinstance(candidate.score_details, _StepScore)
            score = candidate.score_details

            # Read the step number value from the parent candidate's first
            # source block; skip candidates where that is not possible.
            step_num_candidate = score.step_number_candidate
            if not step_num_candidate.source_blocks:
                continue
            text_block = step_num_candidate.source_blocks[0]
            if not isinstance(text_block, Text):
                continue

            step_value = extract_step_number_value(text_block.text)
            if step_value is None:
                continue

            parts_list_candidate = score.parts_list_candidate

            # Skip if this step number value is already used
            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            # Skip if this parts_list is already used (if it has parts)
            if parts_list_candidate is not None:
                # A parts list counts as "having parts" when its score details
                # expose a non-empty part_candidates collection.
                part_candidates = getattr(
                    parts_list_candidate.score_details, "part_candidates", ()
                )
                if len(part_candidates) > 0:
                    # Candidates are tracked by object identity.
                    parts_list_id = id(parts_list_candidate)
                    if parts_list_id in used_parts_list_ids:
                        log.debug(
                            "[step] Skipping candidate for step %d - "
                            "PartsList candidate already used",
                            step_value,
                        )
                        continue
                    # Claim this parts_list
                    used_parts_list_ids.add(parts_list_id)

            # Select this candidate
            selected.append(candidate)
            used_step_values.add(step_value)

            log.debug(
                "[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list_candidate is not None else "no",
                score.pairing_score(),
            )

        return selected
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc