• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19155446196

06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%
19155446196

push

github

bramp
refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.5
/src/build_a_long/pdf_extract/classifier/step_number_classifier.py
1
"""
2
Step number classifier.
3
"""
4

5
from dataclasses import dataclass
1✔
6

7
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
8
    Candidate,
9
    ClassificationHints,
10
    ClassificationResult,
11
    ClassifierConfig,
12
)
13
from build_a_long.pdf_extract.classifier.label_classifier import (
1✔
14
    LabelClassifier,
15
)
16
from build_a_long.pdf_extract.classifier.text_extractors import (
1✔
17
    extract_step_number_value,
18
)
19
from build_a_long.pdf_extract.extractor import PageData
1✔
20
from build_a_long.pdf_extract.extractor.lego_page_elements import StepNumber
1✔
21
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
22

23

24
@dataclass
class _StepNumberScore:
    """Component scores for one step number candidate.

    Holds the individual signals so they can be inspected alongside the
    final weighted value produced by ``combined_score``.
    """

    # How well the text matches step number patterns (0.0-1.0).
    text_score: float

    # Block height relative to the page number height (0.0-1.0).
    size_score: float

    def combined_score(self, config: ClassifierConfig) -> float:
        """Return the weighted average of the component scores.

        Args:
            config: Supplies ``step_number_text_weight`` and
                ``step_number_size_weight``.

        Returns:
            The weighted sum divided by the sum of the weights, or 0.0 when
            the weights do not sum to a positive number.
        """
        text_weight = config.step_number_text_weight
        size_weight = config.step_number_size_weight

        weighted_sum = text_weight * self.text_score + size_weight * self.size_score
        weight_sum = text_weight + size_weight

        # Guard against division by zero (or a negative total weight).
        if weight_sum > 0:
            return weighted_sum / weight_sum
        return 0.0
1✔
44

45

46
class StepNumberClassifier(LabelClassifier):
    """Classifier for step numbers.

    ``evaluate`` builds a scored candidate for every text block that parses
    as a step number, and ``classify`` promotes the acceptable candidates to
    winners.
    """

    outputs = {"step_number"}
    requires = {"page_number"}

    # A block must be taller than this multiple of the page number height
    # to receive a non-zero size score.
    _MIN_HEIGHT_RATIO = 1.1
    # Relative height excess over the page number at which the size score
    # saturates at 1.0 (0.5 means 50% taller than the page number).
    _SIZE_SATURATION_EXCESS = 0.5
    # Fraction of the page height, measured from the bottom edge, in which
    # text blocks are skipped because page numbers typically appear there.
    _BOTTOM_BAND_FRACTION = 0.1

    def _score_step_number_size(
        self, element: Text, page_num_height: float | None
    ) -> float:
        """Score a text block by its height relative to the page number height.

        Args:
            element: The text block being scored.
            page_num_height: Height of the page number block, or None when
                no page number is available for comparison.

        Returns:
            0.0 when there is no usable page number height or the block is
            not more than ``_MIN_HEIGHT_RATIO`` times as tall as the page
            number; otherwise a value that grows linearly with the height
            excess and is clamped to the range [0.0, 1.0].
        """
        if not page_num_height or page_num_height <= 0.0:
            return 0.0

        h = element.bbox.height
        if h <= page_num_height * self._MIN_HEIGHT_RATIO:
            return 0.0

        ratio_over = (h / page_num_height) - 1.0
        return max(0.0, min(1.0, ratio_over / self._SIZE_SATURATION_EXCESS))

    def _find_page_number_height(
        self, page_data: PageData, result: ClassificationResult
    ) -> float | None:
        """Return the height of the first block labeled "page_number", if any."""
        labeled_blocks = result.get_labeled_blocks()
        for block in page_data.blocks:
            if labeled_blocks.get(block) == "page_number":
                return block.bbox.height
        return None

    def evaluate(
        self,
        page_data: PageData,
        result: ClassificationResult,
    ) -> None:
        """Evaluate blocks and create candidates for step numbers.

        Scores each text block, attempts to construct a StepNumber from it,
        and stores every candidate on ``result`` together with its score and
        any failure reason. No winners are chosen here; see ``classify``.

        Args:
            page_data: The page whose blocks are evaluated.
            result: Receives the candidates via ``add_candidate``.
        """
        if not page_data.blocks:
            return

        # Height of the page number block, used for size comparison.
        page_num_height = self._find_page_number_height(page_data, result)

        # The bottom band boundary depends only on the page, so compute it
        # once instead of once per block.
        page_bbox = page_data.bbox
        # NOTE(review): presumably narrows an optional bbox for the type
        # checker; confirm pages always carry a bbox at this point.
        assert page_bbox is not None
        bottom_threshold = page_bbox.y1 - (
            page_bbox.height * self._BOTTOM_BAND_FRACTION
        )

        for block in page_data.blocks:
            if not isinstance(block, Text):
                continue

            # Skip blocks centered in the bottom band of the page, where page
            # numbers typically appear.
            block_center_y = (block.bbox.y0 + block.bbox.y1) / 2
            if block_center_y >= bottom_threshold:
                continue

            text_score = self._score_step_number_text(block.text)
            if text_score == 0.0:
                continue

            size_score = self._score_step_number_size(block, page_num_height)

            # If we have a page number for size comparison, require the block
            # to be taller than the page number (size_score > 0). This
            # prevents small numeric text from being classified as step
            # numbers.
            if page_num_height and size_score == 0.0:
                continue

            # Store detailed score object
            detail_score = _StepNumberScore(
                text_score=text_score,
                size_score=size_score,
            )

            # Try to construct (parse step number value)
            value = extract_step_number_value(block.text)
            constructed_elem = None
            failure_reason = None

            if value is not None:
                constructed_elem = StepNumber(
                    value=value,
                    bbox=block.bbox,
                )
            else:
                # Defensive: the text score above already requires the text
                # to parse, so this branch is not expected to be reached.
                failure_reason = (
                    f"Could not parse step number from text: '{block.text}'"
                )

            # Add candidate
            result.add_candidate(
                "step_number",
                Candidate(
                    bbox=block.bbox,
                    label="step_number",
                    score=detail_score.combined_score(self.config),
                    score_details=detail_score,
                    constructed=constructed_elem,
                    source_block=block,
                    failure_reason=failure_reason,
                    is_winner=False,  # Will be set by classify()
                ),
            )

    def classify(
        self,
        page_data: PageData,
        result: ClassificationResult,
        hints: ClassificationHints | None,
    ) -> None:
        """Select winning step numbers from pre-built candidates.

        A candidate wins when its source block is not the page number block,
        its score reaches ``config.min_confidence_threshold``, and a
        StepNumber was constructed for it. Rejected candidates get a
        ``failure_reason`` describing why.

        Args:
            page_data: The page being classified.
            result: Holds the candidates and receives the winners.
            hints: Not used by this classifier.
        """
        # Get pre-built candidates
        candidate_list = result.get_candidates("step_number")

        # Find the page number block to avoid classifying it as a step number
        page_number_blocks = result.get_blocks_by_label("page_number")
        page_number_block = page_number_blocks[0] if page_number_blocks else None

        # Mark winners (all successfully constructed candidates that aren't
        # the page number and meet the confidence threshold)
        for candidate in candidate_list:
            # Only compare identities when a page number block exists;
            # otherwise a candidate without a source block would match
            # ``None is None`` and be rejected with a misleading reason.
            if (
                page_number_block is not None
                and candidate.source_block is page_number_block
            ):
                # Don't classify page number as step number
                candidate.failure_reason = "Block is already labeled as page_number"
                continue

            if candidate.score < self.config.min_confidence_threshold:
                candidate.failure_reason = (
                    f"Score {candidate.score:.2f} below threshold "
                    f"{self.config.min_confidence_threshold}"
                )
                continue

            if candidate.constructed is None:
                # Already has failure_reason from evaluate()
                continue

            # This is a winner!
            assert isinstance(candidate.constructed, StepNumber)
            result.mark_winner(candidate, candidate.constructed)
            self.classifier._remove_child_bboxes(
                page_data, candidate.source_block, result
            )
            self.classifier._remove_similar_bboxes(
                page_data, candidate.source_block, result
            )

    def _score_step_number_text(self, text: str) -> float:
        """Score text based on how well it matches step number patterns.

        Args:
            text: The raw text of a block.

        Returns:
            1.0 if ``extract_step_number_value`` can parse the text,
            0.0 otherwise.
        """
        # Use the extraction function to validate format
        if extract_step_number_value(text) is not None:
            return 1.0
        return 0.0
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc