• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19684665991

25 Nov 2025 09:28PM UTC coverage: 90.542% (-0.7%) from 91.259%
19684665991

push

github

bramp
Improve Part string representation formatting

- Only show number field if present (cleaner output)
- Change format from 'Part(count=1x, number=...)' to 'Part(count=1x, number=...)'
- Reduces noise when Part has no number assigned

0 of 2 new or added lines in 1 file covered. (0.0%)

388 existing lines in 32 files now uncovered.

7189 of 7940 relevant lines covered (90.54%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.92
/src/build_a_long/pdf_extract/classifier/progress_bar_classifier.py
1
"""
2
Progress bar classifier.
3

4
Purpose
5
-------
6
Identify progress bars at the bottom of instruction pages. Progress bars are
7
typically horizontal elements spanning most of the page width, located near
8
the page number at the bottom of the page.
9

10
Heuristic
11
---------
12
- Look for Drawing/Image elements near the bottom of the page
13
- Must span a significant portion of the page width (at least 30%; the width score saturates at 80%)
14
- Should be relatively thin vertically (width:height of at least 3:1; the score saturates at 10:1)
15
- Located near the bottom margin (bottom edge within the lowest 20% of the page height); horizontal proximity to the page number boosts the score
16
- May consist of multiple adjacent elements forming a single visual bar
17

18
Debugging
19
---------
20
Enable with `LOG_LEVEL=DEBUG` for structured logs.
21
"""
22

23
from __future__ import annotations
1✔
24

25
import logging
1✔
26
from dataclasses import dataclass
1✔
27

28
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
29
    Candidate,
30
    ClassificationResult,
31
    ClassifierConfig,
32
)
33
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
34
    LabelClassifier,
35
)
36
from build_a_long.pdf_extract.extractor.bbox import BBox
1✔
37
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
38
    LegoPageElements,
39
    PageNumber,
40
    ProgressBar,
41
)
42
from build_a_long.pdf_extract.extractor.page_blocks import (
1✔
43
    Drawing,
44
    Image,
45
)
46

47
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
1✔
48

49

50
@dataclass
class _ProgressBarScore:
    """Per-candidate score breakdown used by the progress bar classifier."""

    position_score: float
    """How close the element sits to the bottom of the page (0.0-1.0)."""

    width_score: float
    """How much of the page width the element covers (0.0-1.0)."""

    aspect_ratio_score: float
    """How wide-and-thin the element is (0.0-1.0)."""

    original_width: float
    """Width of the element before it was clipped to the page."""

    clipped_bbox: BBox
    """The element's bounding box after clipping to the page."""

    def combined_score(self, config: ClassifierConfig) -> float:
        """Return the unweighted mean of the three component scores.

        ``config`` is accepted as part of the method signature but is not
        read: every component currently carries the same weight.
        """
        component_total = (
            self.position_score + self.width_score + self.aspect_ratio_score
        )
        return component_total / 3.0
1✔
74

75

76
@dataclass(frozen=True)
class ProgressBarClassifier(LabelClassifier):
    """Classifier for progress bars on instruction pages.

    ``score`` examines every Drawing/Image block on the page and registers a
    ``"progress_bar"`` candidate for each one that sits in the bottom band of
    the page, is wide enough, and is bar-shaped. ``construct`` later turns
    those candidates into ``ProgressBar`` elements.
    """

    outputs = frozenset({"progress_bar"})
    requires = frozenset({"page_number"})

    def score(self, result: ClassificationResult) -> None:
        """Score Drawing/Image elements and create candidates WITHOUT construction.

        Blocks whose position, width or aspect-ratio score is exactly 0.0 are
        skipped; every other block is added to ``result`` as a candidate
        whose bbox has been clipped to the page.
        """
        page_data = result.page_data
        page_bbox = page_data.bbox
        assert page_bbox is not None

        # Page number location (if any) is used to boost nearby elements.
        page_number_bbox = self._get_page_number_bbox(result)

        for block in page_data.blocks:
            # Only consider Drawing and Image elements
            if not isinstance(block, Drawing | Image):
                continue

            position_score = self._score_bottom_position(
                block.bbox, page_bbox, page_number_bbox
            )

            # Skip if not in bottom 20% of page
            if position_score == 0.0:
                continue

            width_score = self._score_width_coverage(block.bbox, page_bbox)
            aspect_ratio_score = self._score_aspect_ratio(block.bbox)

            # Must have minimum width (at least 30% of page width)
            if width_score == 0.0:
                continue

            # Must have aspect ratio suggesting horizontal bar (at least 3:1)
            if aspect_ratio_score == 0.0:
                continue

            # Remember the unclipped width: the candidate's bbox is clipped
            # to the page, but the full width is kept for the element.
            original_width = block.bbox.width
            clipped_bbox = block.bbox.clip_to(page_bbox)

            score_details = _ProgressBarScore(
                position_score=position_score,
                width_score=width_score,
                aspect_ratio_score=aspect_ratio_score,
                original_width=original_width,
                clipped_bbox=clipped_bbox,
            )

            combined = score_details.combined_score(self.config)

            log.debug(
                "progress_bar candidate at %s: position=%.3f width=%.3f "
                "aspect=%.3f combined=%.3f",
                clipped_bbox,
                position_score,
                width_score,
                aspect_ratio_score,
                combined,
            )

            # Store candidate WITHOUT construction
            result.add_candidate(
                "progress_bar",
                Candidate(
                    bbox=clipped_bbox,
                    label="progress_bar",
                    score=combined,
                    score_details=score_details,
                    constructed=None,
                    source_blocks=[block],
                    failure_reason=None,
                ),
            )

    def construct(self, result: ClassificationResult) -> None:
        """Construct ProgressBar elements from candidates.

        A failure while building one candidate does not abort the others:
        the error text is stored on the candidate's ``failure_reason`` and
        reported at DEBUG level.
        """
        for candidate in result.get_candidates("progress_bar"):
            try:
                candidate.constructed = self._construct_single(candidate, result)
            except Exception as e:
                # Deliberate best-effort: keep going, but leave a trace.
                candidate.failure_reason = str(e)
                log.debug(
                    "Failed to construct progress_bar candidate at %s: %s",
                    candidate.bbox,
                    e,
                )

    def _construct_single(
        self, candidate: Candidate, result: ClassificationResult
    ) -> LegoPageElements:
        """Construct a ProgressBar element from a single candidate.

        The element uses the clipped bbox and the pre-clipping width stored
        in the candidate's score details; ``progress`` is left as ``None``.
        """
        detail_score = candidate.score_details
        assert isinstance(detail_score, _ProgressBarScore)

        return ProgressBar(
            bbox=detail_score.clipped_bbox,
            progress=None,
            full_width=detail_score.original_width,
        )

    def _get_page_number_bbox(self, result: ClassificationResult) -> BBox | None:
        """Get the bbox of the page number if it has been classified.

        Returns the bbox of the first ``"page_number"`` candidate reported as
        valid, or ``None`` when there is none.
        """
        for candidate in result.get_scored_candidates("page_number"):
            if candidate.is_valid:
                assert isinstance(candidate.constructed, PageNumber)
                return candidate.bbox

        return None

    def _score_bottom_position(
        self, bbox: BBox, page_bbox: BBox, page_number_bbox: BBox | None
    ) -> float:
        """Score based on position at bottom of page.

        Returns 0.0 when the element's bottom edge is more than 20% of the
        page height above the page bottom (or the page has no height).
        Otherwise the score rises linearly to 1.0 as the bottom edge
        approaches the page bottom, and is multiplied by 1.2 (capped at 1.0)
        when one horizontal end of the element lies within 30% of the page
        width of the facing edge of the page number.
        """
        page_height = page_bbox.height
        # A degenerate page cannot be scored; avoid dividing by zero, in
        # line with the zero-height guard in _score_aspect_ratio.
        if page_height <= 0:
            return 0.0

        # Distance from the element's bottom edge to the page bottom, as a
        # fraction of the page height.
        bottom_distance = page_bbox.y1 - bbox.y1
        bottom_margin_ratio = bottom_distance / page_height

        # Should be in bottom 20% of page
        if bottom_margin_ratio > 0.2:
            return 0.0

        # Score based on proximity to bottom (closer = better)
        position_score = 1.0 - (bottom_margin_ratio / 0.2)

        # Boost score if near page number
        if page_number_bbox is not None:
            # Edge-to-edge gap: the element's left edge against the page
            # number's right edge, or the element's right edge against the
            # page number's left edge, whichever is smaller.
            horizontal_distance = min(
                abs(bbox.x0 - page_number_bbox.x1),
                abs(bbox.x1 - page_number_bbox.x0),
            )
            if horizontal_distance < page_bbox.width * 0.3:
                position_score *= 1.2

        # Clamp: the boost, or an element reaching below the page bottom
        # (negative ratio), can push the raw value above 1.0.
        return min(1.0, position_score)

    def _score_width_coverage(self, bbox: BBox, page_bbox: BBox) -> float:
        """Score based on how much of the page width the element spans.

        Returns 0.0 below 30% coverage (or when the page has no width), 1.0
        at 80% coverage or more, and a linear ramp in between.
        """
        page_width = page_bbox.width
        # A degenerate page cannot be scored; avoid dividing by zero.
        if page_width <= 0:
            return 0.0

        width_ratio = bbox.width / page_width

        # Penalize elements that are too narrow
        if width_ratio < 0.3:
            return 0.0

        # Score increases with width, maxing at 80% coverage
        # (some margin is expected on sides)
        if width_ratio >= 0.8:
            return 1.0

        # Linear interpolation between 0.3 and 0.8
        return (width_ratio - 0.3) / 0.5

    def _score_aspect_ratio(self, bbox: BBox) -> float:
        """Score based on aspect ratio (should be wide and thin).

        Returns 0.0 for a width:height ratio below 3:1 (including elements
        with no positive height), 1.0 at 10:1 or more, and a linear ramp in
        between.
        """
        aspect_ratio = bbox.width / bbox.height if bbox.height > 0 else 0.0

        if aspect_ratio < 3.0:  # Too square
            return 0.0

        if aspect_ratio >= 10.0:  # Good aspect ratio
            return 1.0

        # Linear interpolation between 3 and 10
        return (aspect_ratio - 3.0) / 7.0
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc