• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19751674072

28 Nov 2025 01:40AM UTC coverage: 89.023% (-0.8%) from 89.847%
19751674072

push

github

bramp
refactor(classifier): reorganize text modules and break circular dependencies

Major Changes:
- Created text/ subdirectory for text-related classifier modules
- Moved text_histogram, text_extractors, font_size_hints to text/
- Created constants.py to resolve circular dependency issue

Module Organization:
- classifier/text/__init__.py: Package exports for text modules
- classifier/text/text_histogram.py: TextHistogram class
- classifier/text/text_extractors.py: Text extraction functions
- classifier/text/font_size_hints.py: FontSizeHints class

Circular Dependency Resolution:
- Created classifier/constants.py with CATALOG_ELEMENT_ID_THRESHOLD
- Removed ClassVar from PageHintCollection
- Updated font_size_hints.py and page_hint_collection.py to import from constants
- Fixed package-level circular import by importing TextHistogram directly from module
- Added TODO to consider moving constant to ClassifierConfig

Bug Fixes:
- Fixed DrawableItem frozen model issue in drawing.py
- Create new instances with depth instead of mutating frozen objects

Import Updates:
- Updated all imports across ~15 files to use new module paths
- Updated classifier/__init__.py to re-export text module classes

Tests:
- All tests passing (42/42 test files)
- Type checking passes
- Code formatted with ruff

32 of 32 new or added lines in 23 files covered. (100.0%)

180 existing lines in 19 files now uncovered.

7429 of 8345 relevant lines covered (89.02%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.62
/src/build_a_long/pdf_extract/classifier/steps/step_classifier.py
1
"""
2
Step classifier.
3

4
Purpose
5
-------
6
Identify complete Step structures by combining step_number, parts_list, and diagram
7
elements. A Step represents a single building instruction comprising:
8
- A StepNumber label
9
- An optional PartsList (the parts needed for this step)
10
- A Diagram (the main instruction graphic showing what to build)
11

12
We look for step_numbers and attempt to pair them with nearby parts_lists and
13
identify the appropriate diagram region for each step.
14

15
Debugging
16
---------
17
Set environment variables to aid investigation without code changes:
18

19
- LOG_LEVEL=DEBUG
20
    Enables DEBUG-level logging (if not already configured by caller).
21
"""
22

23
import logging
1✔
24
from dataclasses import dataclass
1✔
25

26
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
27
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
28
    ClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
31
    LabelClassifier,
32
)
33
from build_a_long.pdf_extract.classifier.parts.parts_list_classifier import (
1✔
34
    _PartsListScore,
35
)
36
from build_a_long.pdf_extract.classifier.score import Score, Weight
1✔
37
from build_a_long.pdf_extract.classifier.text import (
1✔
38
    extract_step_number_value,
39
)
40
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
41
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
42
    Diagram,
43
    PartsList,
44
    Step,
45
    StepNumber,
46
)
47
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
48

49
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
1✔
50

51

52
class _StepScore(Score):
    """Score details recording how a StepNumber candidate pairs with a PartsList."""

    step_number_candidate: Candidate
    """The step number candidate this step is associated with."""

    parts_list_candidate: Candidate | None
    """The parts list candidate paired with this step (if any)."""

    has_parts_list: bool
    """Whether this step has an associated parts list."""

    step_proximity_score: float
    """Score based on proximity to the PartsList above (0.0-1.0).
    1.0 for closest proximity, 0.0 if very far. 0.0 if no parts list."""

    step_alignment_score: float
    """Score based on left-edge alignment with PartsList above (0.0-1.0).
    1.0 is perfect alignment, 0.0 is very misaligned. 0.0 if no parts list."""

    diagram_area: float
    """Area of the diagram region."""

    def score(self) -> Weight:
        """Expose the pairing score as this score object's main score."""
        return self.pairing_score()

    def pairing_score(self) -> float:
        """Return the mean of proximity and alignment, or 0.0 without a parts list."""
        if self.has_parts_list:
            return (self.step_proximity_score + self.step_alignment_score) / 2.0
        return 0.0

    def sort_key(self) -> tuple[float, int]:
        """Build the key used to order competing Step candidates.

        Ascending order on this key puts:
        1. Higher pairing scores first (the score is negated).
        2. Lower step number values first among equal scores.

        The step value is read from the first source block of the step number
        candidate; when that block is missing, is not a ``Text`` block, or
        yields no value, 0 is used instead.
        """
        primary = -self.pairing_score()

        blocks = self.step_number_candidate.source_blocks
        if not blocks:
            return (primary, 0)

        # Only the first source block is consulted for the step number text.
        first_block = blocks[0]
        if not isinstance(first_block, Text):
            return (primary, 0)

        value = extract_step_number_value(first_block.text)
        return (primary, 0 if value is None else value)
×
105

106

107
@dataclass(frozen=True)
class StepClassifier(LabelClassifier):
    """Classifier for complete Step structures.

    Scoring pairs every ``step_number`` candidate with every ``parts_list``
    candidate (plus a "no parts list" fallback) and greedily keeps the best
    pairing per step number value. Building turns a selected candidate into a
    ``Step`` made of a StepNumber, a PartsList and a heuristic Diagram region.
    """

    output = "step"
    requires = frozenset({"step_number", "parts_list"})

    def _score(self, result: ClassificationResult) -> None:
        """Score step pairings and register the winning candidates.

        Args:
            result: Classification result that supplies the ``step_number`` and
                ``parts_list`` candidates and receives the new ``step``
                candidates via ``add_candidate``.
        """
        page_data = result.page_data

        # Work with candidates (not constructed elements).
        step_candidates = result.get_scored_candidates(
            "step_number", valid_only=False, exclude_failed=True
        )
        if not step_candidates:
            return

        parts_list_candidates = result.get_scored_candidates(
            "parts_list",
            valid_only=False,
            exclude_failed=True,
        )

        log.debug(
            "[step] page=%s step_candidates=%d parts_list_candidates=%d",
            page_data.page_number,
            len(step_candidates),
            len(parts_list_candidates),
        )

        # Enumerate every pairing; the trailing ``None`` is the fallback
        # "step without a parts list" and is deliberately created last.
        all_candidates: list[Candidate] = []
        for step_candidate in step_candidates:
            for parts_list_candidate in (*parts_list_candidates, None):
                candidate = self._create_step_candidate(
                    step_candidate, parts_list_candidate, result
                )
                if candidate is not None:
                    all_candidates.append(candidate)

        # Greedily select the best candidates (deduplication).
        deduplicated_candidates = self._deduplicate_candidates(all_candidates)

        for candidate in deduplicated_candidates:
            result.add_candidate(candidate)

        log.debug(
            "[step] Created %d deduplicated step candidates (from %d possibilities)",
            len(deduplicated_candidates),
            len(all_candidates),
        )

    def build(self, candidate: Candidate, result: ClassificationResult) -> Step:
        """Construct a Step element from a single candidate.

        Args:
            candidate: A ``step`` candidate whose ``score_details`` is a
                ``_StepScore``.
            result: Classification result used to build the parent
                StepNumber / PartsList elements and to size the diagram.

        Returns:
            The constructed Step. When the candidate has no parts list, an
            empty PartsList placed at the step number's bbox is used.
        """
        score = candidate.score_details
        assert isinstance(score, _StepScore)

        # Build the step number from its parent candidate.
        step_num = result.build(score.step_number_candidate)
        assert isinstance(step_num, StepNumber)

        # Build the parts list from its parent candidate (if present).
        parts_list: PartsList | None = None
        if score.parts_list_candidate is not None:
            parts_list_elem = result.build(score.parts_list_candidate)
            assert isinstance(parts_list_elem, PartsList)
            parts_list = parts_list_elem

        diagram_bbox = self._identify_diagram_region(
            step_num.bbox,
            parts_list.bbox if parts_list is not None else None,
            result,
        )
        diagram = Diagram(bbox=diagram_bbox)

        return Step(
            bbox=self._compute_step_bbox(step_num, parts_list, diagram),
            step_number=step_num,
            parts_list=(
                parts_list
                if parts_list is not None
                else PartsList(bbox=step_num.bbox, parts=[])
            ),
            diagram=diagram,
        )

    def _create_step_candidate(
        self,
        step_candidate: Candidate,
        parts_list_candidate: Candidate | None,
        result: ClassificationResult,
    ) -> Candidate | None:
        """Create a scored (but not yet constructed) Step candidate.

        Args:
            step_candidate: The StepNumber candidate for this step
            parts_list_candidate: The PartsList candidate to pair with (or None)
            result: Classification result

        Returns:
            The created Candidate with score but no construction. Proximity
            and alignment stay 0.0 unless the parts list sits above the step
            number (within ``ABOVE_EPS``).
        """
        ABOVE_EPS = 2.0  # Small epsilon for "above" check
        ALIGNMENT_THRESHOLD_MULTIPLIER = 1.0  # Max horizontal offset
        DISTANCE_THRESHOLD_MULTIPLIER = 1.0  # Max vertical distance

        step_bbox = step_candidate.bbox
        parts_list_bbox = (
            parts_list_candidate.bbox if parts_list_candidate is not None else None
        )

        proximity_score = 0.0
        alignment_score = 0.0

        if (
            parts_list_bbox is not None
            and parts_list_bbox.y1 <= step_bbox.y0 + ABOVE_EPS
        ):
            # Vertical gap between the parts list bottom and the step number
            # top. It can be slightly negative because ABOVE_EPS tolerates a
            # small overlap.
            distance = step_bbox.y0 - parts_list_bbox.y1

            max_distance = step_bbox.height * DISTANCE_THRESHOLD_MULTIPLIER
            if max_distance > 0:
                # Clamp to [0.0, 1.0]: a negative distance would otherwise
                # push the score above its documented upper bound.
                proximity_score = min(
                    1.0, max(0.0, 1.0 - (distance / max_distance))
                )

            # How well the left edges line up, relative to the step width.
            max_alignment_diff = step_bbox.width * ALIGNMENT_THRESHOLD_MULTIPLIER
            left_diff = abs(parts_list_bbox.x0 - step_bbox.x0)
            if max_alignment_diff > 0:
                alignment_score = max(0.0, 1.0 - (left_diff / max_alignment_diff))

        # Estimate diagram bbox for scoring purposes.
        diagram_bbox = self._identify_diagram_region(step_bbox, parts_list_bbox, result)

        score = _StepScore(
            step_number_candidate=step_candidate,
            parts_list_candidate=parts_list_candidate,
            has_parts_list=parts_list_candidate is not None,
            step_proximity_score=proximity_score,
            step_alignment_score=alignment_score,
            diagram_area=diagram_bbox.area,
        )

        # The candidate bbox covers the step number, diagram and parts list.
        bboxes = [step_bbox, diagram_bbox]
        if parts_list_bbox is not None:
            bboxes.append(parts_list_bbox)
        combined_bbox = BBox.union_all(bboxes)

        return Candidate(
            bbox=combined_bbox,
            label="step",
            score=score.pairing_score(),
            score_details=score,
            source_blocks=[],
        )

    def _identify_diagram_region(
        self,
        step_bbox: BBox,
        parts_list_bbox: BBox | None,
        result: ClassificationResult,
    ) -> BBox:
        """Identify the diagram region for a step.

        Placeholder heuristic: the region starts at the step number's left
        edge, begins below the lower of the step number and parts list, and
        extends to the page's right and bottom edges.

        Args:
            step_bbox: The step number bbox
            parts_list_bbox: The associated parts list bbox (if any)
            result: Classification result containing page_data

        Returns:
            BBox representing the diagram region
        """
        # TODO: Find actual drawing elements and use their bounds
        page_bbox = result.page_data.bbox
        assert page_bbox is not None

        # Start just below the step number, or below the parts list if that
        # ends lower on the page.
        top = step_bbox.y1
        if parts_list_bbox is not None:
            top = max(top, parts_list_bbox.y1)

        return BBox(x0=step_bbox.x0, y0=top, x1=page_bbox.x1, y1=page_bbox.y1)

    def _compute_step_bbox(
        self,
        step_num: StepNumber,
        parts_list: PartsList | None,
        diagram: Diagram,
    ) -> BBox:
        """Compute the overall bounding box for the Step.

        Args:
            step_num: The step number element
            parts_list: The parts list (if any)
            diagram: The diagram element

        Returns:
            Union of the step number, diagram and (when given) parts list
            bounding boxes.
        """
        bboxes = [step_num.bbox, diagram.bbox]
        if parts_list is not None:
            bboxes.append(parts_list.bbox)

        return BBox.union_all(bboxes)

    def _deduplicate_candidates(self, candidates: list[Candidate]) -> list[Candidate]:
        """Greedily select the best Step candidates.

        Ensures each StepNumber value is used at most once, and each PartsList
        that has part candidates is used at most once. Parts lists without
        part candidates are never claimed and may be shared.

        Args:
            candidates: All possible Step candidates

        Returns:
            Deduplicated list of Step candidates, in selection order
        """
        # Best pairing first; ties fall back to the lower step number.
        # NOTE(review): a pairing with score 0.0 ties with the no-parts-list
        # fallback for the same step and, because it was created earlier and
        # the sort is stable, is tried first - confirm this is intended.
        sorted_candidates = sorted(
            candidates,
            key=lambda c: (
                c.score_details.sort_key()
                if isinstance(c.score_details, _StepScore)
                else (0.0, 0)
            ),
        )

        used_step_values: set[int] = set()
        used_parts_list_ids: set[int] = set()
        selected: list[Candidate] = []

        for candidate in sorted_candidates:
            # Candidates are not constructed yet, so read from score_details.
            assert isinstance(candidate.score_details, _StepScore)
            score = candidate.score_details

            # The step value comes from the step number's first source block;
            # candidates without a readable value are dropped.
            step_num_candidate = score.step_number_candidate
            if not step_num_candidate.source_blocks:
                continue
            text_block = step_num_candidate.source_blocks[0]
            if not isinstance(text_block, Text):
                continue

            step_value = extract_step_number_value(text_block.text)
            if step_value is None:
                continue

            parts_list_candidate = score.parts_list_candidate

            if step_value in used_step_values:
                log.debug(
                    "[step] Skipping candidate for step %d - value already used",
                    step_value,
                )
                continue

            if parts_list_candidate is not None:
                details = parts_list_candidate.score_details
                has_parts = (
                    isinstance(details, _PartsListScore)
                    and len(details.part_candidates) > 0
                )

                if has_parts:
                    # Object identity is sufficient: every candidate stays
                    # referenced by ``candidates`` for the whole call.
                    parts_list_id = id(parts_list_candidate)
                    if parts_list_id in used_parts_list_ids:
                        log.debug(
                            "[step] Skipping candidate for step %d - "
                            "PartsList candidate already used",
                            step_value,
                        )
                        continue
                    # Claim this parts_list
                    used_parts_list_ids.add(parts_list_id)

            selected.append(candidate)
            used_step_values.add(step_value)

            log.debug(
                "[step] Selected step %d (parts_list=%s, pairing_score=%.2f)",
                step_value,
                "yes" if parts_list_candidate is not None else "no",
                score.pairing_score(),
            )

        return selected
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc