• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19121465813

06 Nov 2025 01:09AM UTC coverage: 85.372% (-0.5%) from 85.858%
19121465813

push

github

bramp
chore: Enabled another linter and applied it.

12 of 32 new or added lines in 3 files covered. (37.5%)

32 existing lines in 4 files now uncovered.

4062 of 4758 relevant lines covered (85.37%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.52
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
Classifiers run in a fixed, enforced order because later stages depend on
7
labels produced by earlier stages:
8

9
1) PageNumberClassifier → outputs: "page_number"
10
2) PartCountClassifier  → outputs: "part_count"
11
3) StepNumberClassifier → outputs: "step_number" (uses page_number size as context)
12
4) PartsListClassifier  → outputs: "parts_list" (requires step_number and part_count)
13
5) PartsImageClassifier → outputs: "part_image" (requires parts_list and part_count)
14
6) StepClassifier       → outputs: "step" (requires step_number and parts_list)
15

16
If the order is changed such that a classifier runs before its requirements
17
are available, a ValueError will be raised at initialization time.
18
"""
19

20
import logging
1✔
21

22
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
23
    ClassificationHints,
24
    ClassificationResult,
25
    ClassifierConfig,
26
    RemovalReason,
27
)
28
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
29
    PageNumberClassifier,
30
)
31
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
32
    PartCountClassifier,
33
)
34
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
35
    PartsImageClassifier,
36
)
37
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
38
    PartsListClassifier,
39
)
40
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
41
    StepClassifier,
42
)
43
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
44
    StepNumberClassifier,
45
)
46
from build_a_long.pdf_extract.extractor import PageData
1✔
47
from build_a_long.pdf_extract.extractor.page_elements import Text
1✔
48

49
logger = logging.getLogger(__name__)
1✔
50

51

52
def classify_elements(page: PageData) -> ClassificationResult:
    """Label the elements of one page with the rule-based pipeline.

    A fresh default configuration, classifier and orchestrator are built for
    every call, so no state is shared between invocations.

    Args:
        page: The PageData object whose elements should be classified.

    Returns:
        The ClassificationResult produced by the orchestrator for ``page``.
    """
    orchestrator = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return orchestrator.process_page(page)
66

67

68
def classify_pages(pages: list[PageData]) -> list[ClassificationResult]:
    """Label the elements of several pages with the rule-based pipeline.

    One configuration, classifier and orchestrator are created up front and
    reused for every page, which are processed in the order given.

    Args:
        pages: The PageData objects to classify.

    Returns:
        One ClassificationResult per input page, in the same order.
    """
    orchestrator = ClassificationOrchestrator(Classifier(ClassifierConfig()))
    return [orchestrator.process_page(page_data) for page_data in pages]
87

88

89
class Classifier:
    """
    Runs one rule-based classification pass over a page.

    The pass is driven by the supplied configuration and optional hints.
    Instances are meant to carry no per-page state.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the stage pipeline and check that its ordering is valid.

        Each stage may expose ``requires`` and ``outputs`` label sets; a stage
        without them is treated as requiring / producing nothing.

        Args:
            config: Configuration handed to every stage.

        Raises:
            ValueError: If a stage requires a label that no earlier stage
                lists among its outputs.
        """
        self.config = config
        self.classifiers = [
            PageNumberClassifier(config, self),
            PartCountClassifier(config, self),
            StepNumberClassifier(config, self),
            PartsListClassifier(config, self),
            PartsImageClassifier(config, self),
            StepClassifier(config, self),
        ]

        # Walk the pipeline in order, tracking which labels exist so far.
        available: set[str] = set()
        for stage in self.classifiers:
            required = getattr(stage, "requires", set())
            if required.issubset(available):
                available |= getattr(stage, "outputs", set())
                continue

            stage_name = stage.__class__.__name__
            missing = ", ".join(sorted(required - available))
            raise ValueError(
                f"Classifier order invalid: {stage_name} requires labels "
                f"not yet produced: {missing}"
            )

    def classify(
        self, page_data: PageData, hints: ClassificationHints | None = None
    ) -> ClassificationResult:
        """
        Run every stage over the page and return the collected result.

        Each stage is asked to ``evaluate`` and then ``classify`` before the
        next stage starts. Afterwards the post-classification consistency
        warnings are attached to the result.

        Args:
            page_data: The page to classify.
            hints: Optional hints forwarded to each stage's ``classify``.

        Returns:
            A new ClassificationResult created for ``page_data``.
        """
        outcome = ClassificationResult(page_data=page_data)

        for stage in self.classifiers:
            stage.evaluate(page_data, outcome)
            stage.classify(page_data, outcome, hints)

        for message in self._log_post_classification_warnings(page_data, outcome):
            outcome.add_warning(message)

        return outcome

    def _remove_child_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark as removed every element whose bbox lies fully inside the target's.

        Args:
            page_data: Page whose ``elements`` are scanned.
            target: Element whose bbox acts as the container; never removed.
            result: Receives the ``mark_removed`` calls.
            keep_ids: ``id()`` values of elements that must not be removed.
        """
        protected = keep_ids if keep_ids is not None else set()
        container = target.bbox

        for candidate in page_data.elements:
            if candidate is target:
                continue
            if id(candidate) in protected:
                continue
            if not candidate.bbox.fully_inside(container):
                continue
            result.mark_removed(
                candidate,
                RemovalReason(reason_type="child_bbox", target_element=target),
            )

    def _remove_similar_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark as removed every element whose bbox nearly duplicates the target's.

        An element counts as a near-duplicate when either its IoU with the
        target bbox is at least 0.8, or its center is within 1.5 units of the
        target's center on both axes and its area differs from the target's
        by at most 12% (this second test is skipped for a zero-area target).

        Args:
            page_data: Page whose ``elements`` are scanned.
            target: Reference element; never removed.
            result: Receives the ``mark_removed`` calls.
            keep_ids: ``id()`` values of elements that must not be removed.
        """
        protected = keep_ids if keep_ids is not None else set()

        iou_threshold = 0.8
        center_eps = 1.5
        area_tol = 0.12

        ref_area = target.bbox.area
        ref_cx, ref_cy = target.bbox.center

        for candidate in page_data.elements:
            if candidate is target or id(candidate) in protected:
                continue

            box = candidate.bbox
            is_duplicate = target.bbox.iou(box) >= iou_threshold

            if not is_duplicate:
                # Fall back to the center-and-area comparison.
                cx, cy = box.center
                centers_match = (
                    abs(cx - ref_cx) <= center_eps
                    and abs(cy - ref_cy) <= center_eps
                )
                is_duplicate = (
                    centers_match
                    and ref_area > 0
                    and abs(box.area - ref_area) / ref_area <= area_tol
                )

            if is_duplicate:
                result.mark_removed(
                    candidate,
                    RemovalReason(reason_type="similar_bbox", target_element=target),
                )

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Collect consistency warnings for a classified page.

        Despite the name, nothing is logged here; the messages are returned.
        Three checks run, in this order:

        * the page has at least one "page_number" label;
        * every "parts_list" fully contains at least one "part_count";
        * every Text labeled "step_number" has some parts list whose bottom
          edge is no more than 2.0 units below the step number's top edge.

        Args:
            page_data: The page that was classified (used for its page number).
            result: The result holding the labeled elements.

        Returns:
            The warning messages, possibly empty.
        """
        # Bucket the labeled elements in one pass, preserving their order.
        found_page_number = False
        parts_lists = []
        part_counts = []
        step_numbers: list[Text] = []
        for element, label in result.get_labeled_elements().items():
            if label == "page_number":
                found_page_number = True
            elif label == "parts_list":
                parts_lists.append(element)
            elif label == "part_count":
                part_counts.append(element)
            elif label == "step_number" and isinstance(element, Text):
                step_numbers.append(element)

        messages = []

        if not found_page_number:
            messages.append(f"Page {page_data.page_number}: missing page number")

        for parts_list in parts_lists:
            if any(
                count.bbox.fully_inside(parts_list.bbox) for count in part_counts
            ):
                continue
            messages.append(
                f"Page {page_data.page_number}: parts list at "
                f"{parts_list.bbox} contains no part counts"
            )

        above_eps = 2.0
        for step in step_numbers:
            step_box = step.bbox
            limit = step_box.y0 + above_eps
            if any(candidate.bbox.y1 <= limit for candidate in parts_lists):
                continue
            messages.append(
                f"Page {page_data.page_number}: step number '{step.text}' at "
                f"{step_box} has no parts list above it"
            )

        return messages
244

245

246
class ClassificationOrchestrator:
    """
    Manages the backtracking classification process.

    This class is stateful: every ClassificationResult produced by
    ``process_page`` is appended to ``history``, across all pages processed
    by the same instance.
    """

    def __init__(self, classifier: Classifier):
        """Store the classifier and start with an empty result history.

        Args:
            classifier: The Classifier used for every classification attempt.
        """
        self.classifier = classifier
        self.history: list[ClassificationResult] = []

    def process_page(self, page_data: PageData) -> ClassificationResult:
        """
        Orchestrates the classification of a single page, with backtracking.

        The page is classified up to ``max_iterations`` times. An attempt
        that yields no warnings is returned immediately; otherwise new hints
        are generated for the next attempt. When the attempts are exhausted
        the last result is returned as-is, warnings included.

        Args:
            page_data: The page to classify.

        Returns:
            The final ClassificationResult containing labels and removal info.

        Raises:
            RuntimeError: If ``max_iterations`` is configured below 1, so no
                attempt could be made.
        """
        hints = ClassificationHints()

        max_iterations = 1  # TODO raise this in future
        for attempt in range(1, max_iterations + 1):
            result = self.classifier.classify(page_data, hints)
            self.history.append(result)

            inconsistencies = result.get_warnings()
            # Return this page's own result directly rather than re-reading
            # the history shared across pages, and skip generating hints
            # that no further attempt would consume.
            if not inconsistencies or attempt == max_iterations:
                return result

            hints = self._generate_new_hints(result, inconsistencies)

        # Only reachable when max_iterations < 1; fail loudly instead of
        # returning a stale result that belongs to an earlier page.
        raise RuntimeError("max_iterations must be at least 1")

    def _generate_new_hints(
        self, result: ClassificationResult, inconsistencies: list[str]
    ) -> ClassificationHints:
        """
        Creates new hints to guide the next classification run.

        Currently a placeholder: both arguments are ignored and an empty
        ClassificationHints is returned.

        Args:
            result: The result of the attempt that produced the warnings.
            inconsistencies: The warning messages from that attempt.

        Returns:
            The hints for the next attempt.
        """
        # TODO Here, we should look at the font sizes, and use that to help
        # bias the next run towards more consistent results.
        # TODO Figure out how to pass the hints between pages (of the book).
        return ClassificationHints()
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc