• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19058602748

04 Nov 2025 04:38AM UTC coverage: 88.027% (-0.09%) from 88.118%
19058602748

push

github

bramp
Add Step classifier and refactor Candidate to support synthetic elements

- Implement StepClassifier that combines step_number, parts_list, and diagram into Step structures
- Refactor Candidate class to make source_element optional and add required bbox field
- Step is a synthetic/composite element with no single source, so source_element is None
- Update all classifiers to pass bbox explicitly when creating Candidates
- Add comprehensive tests for StepClassifier covering various scenarios
- Update mark_winner and helper methods to handle optional source_element
- Clean up unused step_elements dictionary from StepClassifier
- All tests passing (10/10 classifier test suites)

179 of 181 new or added lines in 5 files covered. (98.9%)

111 existing lines in 10 files now uncovered.

3529 of 4009 relevant lines covered (88.03%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.72
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
Classifiers run in a fixed, enforced order because later stages depend on
7
labels produced by earlier stages:
8

9
1) PageNumberClassifier → outputs: "page_number"
10
2) PartCountClassifier  → outputs: "part_count"
11
3) StepNumberClassifier → outputs: "step_number" (uses page_number size as context)
12
4) PartsListClassifier  → outputs: "parts_list" (requires step_number and part_count)
13
5) PartsImageClassifier → outputs: "part_image" (requires parts_list and part_count)
14
6) StepClassifier       → outputs: "step" (requires step_number and parts_list)
15

16
If the order is changed such that a classifier runs before its requirements
17
are available, a ValueError will be raised at initialization time.
18
"""
19

20
import logging
1✔
21
from typing import List, Optional, Set
1✔
22

23
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
24
    PageNumberClassifier,
25
)
26
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
27
    PartCountClassifier,
28
)
29
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
30
    PartsListClassifier,
31
)
32
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
33
    PartsImageClassifier,
34
)
35
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
36
    StepNumberClassifier,
37
)
38
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
39
    StepClassifier,
40
)
41
from build_a_long.pdf_extract.classifier.types import (
1✔
42
    ClassifierConfig,
43
    ClassificationHints,
44
    ClassificationResult,
45
    RemovalReason,
46
)
47
from build_a_long.pdf_extract.extractor import PageData
1✔
48
from build_a_long.pdf_extract.extractor.page_elements import Text
1✔
49

50
logger = logging.getLogger(__name__)
1✔
51

52

53
def classify_elements(page: PageData) -> ClassificationResult:
    """Run the rule-based classification pipeline over one page.

    A fresh default ClassifierConfig, Classifier and
    ClassificationOrchestrator are built for every call.

    Args:
        page: The PageData object whose elements should be labeled.

    Returns:
        The ClassificationResult produced by the orchestrator for this page.
    """
    engine = Classifier(ClassifierConfig())
    return ClassificationOrchestrator(engine).process_page(page)
1✔
67

68

69
def classify_pages(pages: List[PageData]) -> List[ClassificationResult]:
    """Run the rule-based classification pipeline over several pages.

    One Classifier / ClassificationOrchestrator pair is created with a
    default ClassifierConfig and reused for every page, in input order.

    Args:
        pages: The PageData objects to classify.

    Returns:
        One ClassificationResult per input page, in the same order.
    """
    pipeline = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return [pipeline.process_page(single_page) for single_page in pages]
1✔
88

89

90
class Classifier:
    """
    Executes one rule-based classification pass, driven by a configuration
    and optional hints.
    This class should be stateless.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the ordered sub-classifier pipeline and validate its order.

        Each sub-classifier may expose ``requires`` and ``outputs`` label
        sets (both default to empty). A stage is valid only if everything it
        requires has been output by a stage that appears earlier in the list.

        Args:
            config: Configuration handed to every sub-classifier.

        Raises:
            ValueError: If a stage requires a label no earlier stage outputs.
        """
        self.config = config
        self.classifiers = [
            PageNumberClassifier(config, self),
            PartCountClassifier(config, self),
            StepNumberClassifier(config, self),
            PartsListClassifier(config, self),
            PartsImageClassifier(config, self),
            StepClassifier(config, self),
        ]

        # Walk the pipeline in order, tracking which labels exist so far.
        available: Set[str] = set()
        for stage in self.classifiers:
            required = getattr(stage, "requires", set())
            if required.issubset(available):
                available |= getattr(stage, "outputs", set())
                continue

            stage_name = stage.__class__.__name__
            unmet = ", ".join(sorted(required - available))
            raise ValueError(
                f"Classifier order invalid: {stage_name} requires labels not yet produced: {unmet}"
            )

    def classify(
        self, page_data: PageData, hints: Optional[ClassificationHints] = None
    ) -> ClassificationResult:
        """
        Run every sub-classifier over the page and return the combined result.
        This method does NOT modify page_data directly.

        Args:
            page_data: The page to classify.
            hints: Optional hints forwarded to each sub-classifier's
                ``classify`` call.

        Returns:
            A new ClassificationResult carrying the post-classification
            warnings and the part/image pairs reported by the
            PartsImageClassifier stage (an empty list if no such stage exists).
        """
        outcome = ClassificationResult()

        for stage in self.classifiers:
            stage.evaluate(page_data, outcome)
            stage.classify(page_data, outcome, hints)

        for message in self._log_post_classification_warnings(page_data, outcome):
            outcome.add_warning(message)

        # Pull the persisted relations from the first PartsImageClassifier.
        image_stage = next(
            (s for s in self.classifiers if isinstance(s, PartsImageClassifier)),
            None,
        )
        if image_stage is None:
            outcome.part_image_pairs = []
        else:
            outcome.part_image_pairs = image_stage.get_part_image_pairs()

        return outcome

    def _remove_child_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: Optional[Set[int]] = None,
    ) -> None:
        """Mark as removed every element whose bbox lies fully inside target's.

        Args:
            page_data: Page whose ``elements`` are scanned.
            target: Element whose bbox is the container; never removed itself.
            result: Receives a ``mark_removed`` call (reason type
                "child_bbox") for each contained element.
            keep_ids: ``id()`` values of elements that must not be removed.
        """
        protected = keep_ids if keep_ids is not None else set()
        container = target.bbox

        for candidate in page_data.elements:
            if candidate is target or id(candidate) in protected:
                continue
            if not candidate.bbox.fully_inside(container):
                continue
            result.mark_removed(
                candidate,
                RemovalReason(reason_type="child_bbox", target_element=target),
            )

    def _remove_similar_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: Optional[Set[int]] = None,
    ) -> None:
        """Mark as removed every element whose bbox nearly duplicates target's.

        An element counts as a near-duplicate when either its IoU with the
        target bbox reaches IOU_THRESHOLD, or its center is within CENTER_EPS
        of the target's center on both axes and its area differs from the
        target's by at most AREA_TOL (relative to the target area).

        Args:
            page_data: Page whose ``elements`` are scanned.
            target: Reference element; never removed itself.
            result: Receives a ``mark_removed`` call (reason type
                "similar_bbox") for each near-duplicate.
            keep_ids: ``id()`` values of elements that must not be removed.
        """
        protected = keep_ids if keep_ids is not None else set()

        ref_box = target.bbox
        ref_area = ref_box.area
        ref_x, ref_y = ref_box.center

        IOU_THRESHOLD = 0.8
        CENTER_EPS = 1.5
        AREA_TOL = 0.12

        for other in page_data.elements:
            if other is target or id(other) in protected:
                continue

            box = other.bbox
            overlaps = ref_box.iou(box) >= IOU_THRESHOLD
            if not overlaps:
                # Fallback test: same center and nearly the same area.
                cx, cy = box.center
                centers_match = (
                    abs(cx - ref_x) <= CENTER_EPS and abs(cy - ref_y) <= CENTER_EPS
                )
                if not centers_match:
                    continue

                other_area = box.area
                areas_match = (
                    ref_area > 0
                    and abs(other_area - ref_area) / ref_area <= AREA_TOL
                )
                if not areas_match:
                    continue

            result.mark_removed(
                other,
                RemovalReason(reason_type="similar_bbox", target_element=target),
            )

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> List[str]:
        """Collect consistency warnings about the labels assigned to a page.

        Three checks run, in this order: the page has a "page_number" label;
        every "parts_list" has at least one "part_count" fully inside it; and
        every Text "step_number" has some parts list ending at or above its
        top edge (with ABOVE_EPS of slack).

        Args:
            page_data: The classified page; its page_number prefixes messages.
            result: Source of the element-to-label mapping.

        Returns:
            The warning messages; empty when every check passes.
        """
        labeled = result.get_labeled_elements()
        messages = []

        def labeled_as(wanted):
            # Elements carrying the given label, in mapping order.
            return [elem for elem, label in labeled.items() if label == wanted]

        if not any(label == "page_number" for label in labeled.values()):
            messages.append(f"Page {page_data.page_number}: missing page number")

        parts_lists = labeled_as("parts_list")
        part_counts = labeled_as("part_count")

        for pl in parts_lists:
            if any(count.bbox.fully_inside(pl.bbox) for count in part_counts):
                continue
            messages.append(
                f"Page {page_data.page_number}: parts list at {pl.bbox} contains no part counts"
            )

        step_numbers = [e for e in labeled_as("step_number") if isinstance(e, Text)]
        ABOVE_EPS = 2.0
        for step in step_numbers:
            sb = step.bbox
            if any(pl.bbox.y1 <= sb.y0 + ABOVE_EPS for pl in parts_lists):
                continue
            messages.append(
                f"Page {page_data.page_number}: step number '{step.text}' at {sb} has no parts list above it"
            )

        return messages
1✔
254

255

256
class ClassificationOrchestrator:
    """
    Drives the backtracking classification loop around a Classifier.
    This class is stateful: every result it produces is kept in ``history``.
    """

    def __init__(self, classifier: Classifier):
        """Store the classifier to drive and start with an empty history."""
        self.classifier = classifier
        self.history: List[ClassificationResult] = []

    def process_page(self, page_data: PageData) -> ClassificationResult:
        """
        Classify one page, retrying with new hints while warnings remain.

        Each attempt is appended to ``history``. The first attempt without
        warnings is returned immediately; if the iteration budget runs out,
        the most recent entry in ``history`` is returned.

        Returns:
            The final ClassificationResult containing labels and removal info.
        """
        max_iterations = 1  # TODO raise this in future
        hints = ClassificationHints()

        for _ in range(max_iterations):
            attempt = self.classifier.classify(page_data, hints)
            self.history.append(attempt)

            problems = attempt.get_warnings()
            if problems:
                hints = self._generate_new_hints(attempt, problems)
                continue

            return attempt

        return self.history[-1]

    def _generate_new_hints(
        self, result: ClassificationResult, inconsistencies: List[str]
    ) -> ClassificationHints:
        """
        Produce the hints for the next classification attempt.

        Currently a placeholder: both arguments are ignored and a default
        ClassificationHints is returned.
        """
        # TODO Here, we should look at the font sizes, and use that to help
        # bias the next run towards more consistent results.
        # TODO Figure out how to pass the hints between pages (of the book).
        return ClassificationHints()
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc