• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19724286935

27 Nov 2025 03:27AM UTC coverage: 89.068% (-2.2%) from 91.307%
19724286935

push

github

bramp
test(pdf_extract:classifier): Refactor tests to use fixtures and remove integration test

This commit refactors the classifier test suite to improve maintainability and isolation.

Key changes:
- **Fixtures:** Introduced `conftest.py` with `classifier` and `candidate_factory` fixtures to streamline test setup and candidate creation.
- **Unit Test Conversion:** Updated `parts_classifier_test.py`, `step_classifier_test.py`, `page_number_classifier_test.py`, `part_count_classifier_test.py`, and `step_number_classifier_test.py` to use these fixtures, removing dependency on `classify_elements` and making them true unit tests.
- **Integration Test Removal:** Deleted `test_font_size_integration.py` (and its temporary rename `font_size_scoring_test.py`) as its logic has been moved into the respective classifier unit tests.
- **Cleanup:** Removed debug prints and ensured strict type checking compliance.

382 of 389 new or added lines in 9 files covered. (98.2%)

291 existing lines in 28 files now uncovered.

7585 of 8516 relevant lines covered (89.07%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.55
/src/build_a_long/pdf_extract/classifier/progress_bar_classifier.py
1
"""
2
Progress bar classifier.
3

4
Purpose
5
-------
6
Identify progress bars at the bottom of instruction pages. Progress bars are
7
typically horizontal elements spanning most of the page width, located near
8
the page number at the bottom of the page.
9

10
Heuristic
11
---------
12
- Look for Drawing/Image elements near the bottom of the page
13
- Must span a significant portion of the page width (e.g., >50%)
14
- Should be relatively thin vertically (height << width)
15
- Located near the page number or bottom margin
16
- May consist of multiple adjacent elements forming a single visual bar
17

18
Debugging
19
---------
20
Enable with `LOG_LEVEL=DEBUG` for structured logs.
21
"""
22

23
from __future__ import annotations
1✔
24

25
import logging
1✔
26
from dataclasses import dataclass
1✔
27

28
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
29
    Candidate,
30
    ClassificationResult,
31
    ClassifierConfig,
32
)
33
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
34
    LabelClassifier,
35
)
36
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
37
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
38
    LegoPageElements,
39
    ProgressBar,
40
)
41
from build_a_long.pdf_extract.extractor.page_blocks import (
1✔
42
    Drawing,
43
    Image,
44
)
45

46
log = logging.getLogger(__name__)
1✔
47

48

49
@dataclass
class _ProgressBarScore:
    """Per-candidate scoring breakdown used by the progress bar classifier."""

    position_score: float
    """How close the element sits to the bottom of the page (0.0-1.0)."""

    width_score: float
    """How much of the page width the element covers (0.0-1.0)."""

    aspect_ratio_score: float
    """How wide-and-thin the element is (0.0-1.0)."""

    original_width: float
    """Element width measured before clipping to the page boundaries."""

    clipped_bbox: BBox
    """Element bounding box after clipping to the page boundaries."""

    def combined_score(self, config: ClassifierConfig) -> float:
        """Return the unweighted mean of the three component scores.

        ``config`` is accepted but is not consulted by this implementation.
        """
        # Accumulate left-to-right so the floating point result is stable.
        total = self.position_score + self.width_score
        total += self.aspect_ratio_score
        return total / 3.0
1✔
73

74

75
@dataclass(frozen=True)
1✔
76
class ProgressBarClassifier(LabelClassifier):
1✔
77
    """Classifier for progress bars on instruction pages."""
78

79
    outputs = frozenset({"progress_bar"})
1✔
80
    requires = frozenset({"page_number"})
1✔
81

82
    def score(self, result: ClassificationResult) -> None:
        """Register an unconstructed ``progress_bar`` candidate per plausible block.

        Every Drawing/Image block on the page is scored on three axes
        (bottom position, width coverage, aspect ratio). A block is dropped
        as soon as any axis scores exactly 0.0; each survivor is stored as a
        candidate whose bbox is clipped to the page and whose ``constructed``
        field is left as None.
        """
        page = result.page_data
        page_bbox = page.bbox
        assert page_bbox is not None

        # The page number bbox (None when unavailable) feeds the position scorer.
        page_number_bbox = self._get_page_number_bbox(result)

        # Text and other block types are never progress bars.
        drawable_blocks = (b for b in page.blocks if isinstance(b, Drawing | Image))

        for block in drawable_blocks:
            raw_bbox = block.bbox

            position = self._score_bottom_position(
                raw_bbox, page_bbox, page_number_bbox
            )
            if position == 0.0:
                # The position scorer rejected this block outright.
                continue

            width = self._score_width_coverage(raw_bbox, page_bbox)
            aspect = self._score_aspect_ratio(raw_bbox)
            if width == 0.0 or aspect == 0.0:
                # Too narrow, or not bar-shaped enough.
                continue

            # Record the unclipped width first; the candidate itself uses the
            # bbox trimmed to the page.
            details = _ProgressBarScore(
                position_score=position,
                width_score=width,
                aspect_ratio_score=aspect,
                original_width=raw_bbox.width,
                clipped_bbox=raw_bbox.clip_to(page_bbox),
            )
            overall = details.combined_score(self.config)

            candidate = Candidate(
                bbox=details.clipped_bbox,
                label="progress_bar",
                score=overall,
                score_details=details,
                constructed=None,
                source_blocks=[block],
                failure_reason=None,
            )
            result.add_candidate("progress_bar", candidate)
143

144
    def construct(self, result: ClassificationResult) -> None:
        """Try to build an element for every ``progress_bar`` candidate.

        On success the built element is stored on ``candidate.constructed``.
        Any exception raised along the way is caught and its message is
        stored on ``candidate.failure_reason`` instead, so one bad candidate
        does not stop the others from being processed.
        """
        for cand in result.get_candidates("progress_bar"):
            try:
                cand.constructed = self.construct_candidate(cand, result)
            except Exception as exc:
                cand.failure_reason = str(exc)
×
153

154
    def construct_candidate(
        self, candidate: Candidate, result: ClassificationResult
    ) -> LegoPageElements:
        """Build the ProgressBar described by ``candidate``'s score details.

        The candidate's ``score_details`` must be a ``_ProgressBarScore``
        (checked with an assertion). Its clipped bbox becomes the element's
        bbox and its original width becomes ``full_width``; ``progress`` is
        left as None. ``result`` is not consulted here.
        """
        details = candidate.score_details
        assert isinstance(details, _ProgressBarScore)

        bar = ProgressBar(
            bbox=details.clipped_bbox,
            progress=None,
            full_width=details.original_width,
        )
        return bar
168

169
    def _get_page_number_bbox(self, result: ClassificationResult) -> BBox | None:
        """Return the bbox of the leading ``page_number`` candidate, or None.

        Candidates are fetched with ``valid_only=False`` and
        ``exclude_failed=True``; None is returned when that list is empty.
        """
        ranked = result.get_scored_candidates(
            "page_number", valid_only=False, exclude_failed=True
        )
        # The first entry is treated as the page number; this assumes
        # get_scored_candidates returns the best-scoring candidate first.
        return ranked[0].bbox if ranked else None
1✔
180

181
    def _score_bottom_position(
1✔
182
        self, bbox: BBox, page_bbox: BBox, page_number_bbox: BBox | None
183
    ) -> float:
184
        """Score based on position at bottom of page.
185

186
        Returns higher scores for elements at the bottom of the page,
187
        especially near the page number.
188
        """
189
        page_height = page_bbox.height
1✔
190
        element_bottom = bbox.y1
1✔
191

192
        # Calculate distance from bottom of page
193
        bottom_distance = page_bbox.y1 - element_bottom
1✔
194
        bottom_margin_ratio = bottom_distance / page_height
1✔
195

196
        # Should be in bottom 20% of page
197
        if bottom_margin_ratio > 0.2:
1✔
198
            return 0.0
1✔
199

200
        # Score based on proximity to bottom (closer = better)
201
        position_score = 1.0 - (bottom_margin_ratio / 0.2)
1✔
202

203
        # Boost score if near page number
204
        if page_number_bbox is not None:
1✔
205
            # Check horizontal distance to page number
206
            horizontal_distance = min(
1✔
207
                abs(bbox.x0 - page_number_bbox.x1),
208
                abs(bbox.x1 - page_number_bbox.x0),
209
            )
210
            if horizontal_distance < page_bbox.width * 0.3:
1✔
211
                position_score = min(1.0, position_score * 1.2)
1✔
212

213
        return min(1.0, position_score)
1✔
214

215
    def _score_width_coverage(self, bbox: BBox, page_bbox: BBox) -> float:
1✔
216
        """Score based on how much of the page width the element spans.
217

218
        Progress bars typically span >50% of the page width.
219
        """
220
        width_ratio = bbox.width / page_bbox.width
1✔
221

222
        # Penalize elements that are too narrow
223
        if width_ratio < 0.3:
1✔
224
            return 0.0
1✔
225

226
        # Score increases with width, maxing at 80% coverage
227
        # (some margin is expected on sides)
228
        if width_ratio >= 0.8:
1✔
229
            return 1.0
1✔
230

231
        # Linear interpolation between 0.3 and 0.8
232
        return (width_ratio - 0.3) / 0.5
1✔
233

234
    def _score_aspect_ratio(self, bbox: BBox) -> float:
1✔
235
        """Score based on aspect ratio (should be wide and thin).
236

237
        Progress bars are typically very wide relative to their height.
238
        """
239
        aspect_ratio = bbox.width / bbox.height if bbox.height > 0 else 0.0
1✔
240

241
        # Progress bars should be wide and thin
242
        # Typical aspect ratio might be 10:1 or higher
243
        if aspect_ratio < 3.0:  # Too square
1✔
244
            return 0.0
1✔
245

246
        if aspect_ratio >= 10.0:  # Good aspect ratio
1✔
247
            return 1.0
1✔
248

249
        # Linear interpolation between 3 and 10
250
        return (aspect_ratio - 3.0) / 7.0
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc