• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19217904329

10 Nov 2025 01:35AM UTC coverage: 87.121% (+0.7%) from 86.426%
19217904329

push

github

bramp
Bumped some dependencies.

4600 of 5280 relevant lines covered (87.12%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.41
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
Classifiers run in a fixed, enforced order because later stages depend on
7
labels produced by earlier stages:
8

9
1) PageNumberClassifier → outputs: "page_number"
10
2) PartCountClassifier  → outputs: "part_count"
11
3) StepNumberClassifier → outputs: "step_number" (uses page_number size)
12
4) PartsClassifier      → outputs: "part" (requires part_count, pairs with images)
13
5) PartsListClassifier  → outputs: "parts_list" (requires step_number and part)
14
6) PartsImageClassifier → outputs: "part_image" (requires parts_list, part_count)
15
7) StepClassifier       → outputs: "step" (requires step_number and parts_list)
16

17
If the order is changed such that a classifier runs before its requirements
18
are available, a ValueError will be raised at initialization time.
19
"""
20

21
from __future__ import annotations
1✔
22

23
import logging
1✔
24

25
from build_a_long.pdf_extract.classifier.block_filter import filter_duplicate_blocks
1✔
26
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
27
    BatchClassificationResult,
28
    ClassificationResult,
29
    ClassifierConfig,
30
    RemovalReason,
31
)
32
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
33
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
34
    PageNumberClassifier,
35
)
36
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
37
    PartCountClassifier,
38
)
39
from build_a_long.pdf_extract.classifier.parts_classifier import (
1✔
40
    PartsClassifier,
41
)
42
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
43
    PartsImageClassifier,
44
)
45
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
46
    PartsListClassifier,
47
)
48
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
49
    StepClassifier,
50
)
51
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
52
    StepNumberClassifier,
53
)
54
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
55
from build_a_long.pdf_extract.extractor import PageData
1✔
56
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
57

58
logger = logging.getLogger(__name__)
1✔
59

60

61
def classify_elements(page: PageData) -> ClassificationResult:
    """Run the rule-based classifier over one page with a default configuration.

    A fresh ``ClassifierConfig`` (no arguments) and a fresh ``Classifier`` are
    built on every call.

    Args:
        page: The single PageData object whose elements should be labeled.

    Returns:
        The ClassificationResult produced by ``Classifier.classify`` for ``page``.
    """
    return Classifier(ClassifierConfig()).classify(page)
74

75

76
def classify_pages(pages: list[PageData]) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:
    1. Filtering phase: Remove duplicate/similar blocks on each page
    2. Analysis phase: Build font size hints from text properties across all pages
    3. Classification phase: Use hints to guide element classification

    The input pages are not modified; each one is copied into a new PageData
    that carries the filtered block list.

    Args:
        pages: A list of PageData objects to classify.

    Returns:
        BatchClassificationResult containing per-page results and global histogram
    """
    # Phase 1: Filter duplicate blocks on each page
    filtered_pages: list[PageData] = []
    for page_data in pages:
        filtered_blocks = filter_duplicate_blocks(page_data.blocks)

        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is actually enabled.
        logger.debug(
            "Page %s: filtered %d duplicate blocks",
            page_data.page_number,
            len(page_data.blocks) - len(filtered_blocks),
        )

        # Create a new PageData with filtered blocks
        filtered_pages.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=filtered_blocks,
            )
        )

    # Phase 2: Extract font size hints from all pages
    font_size_hints = FontSizeHints.from_pages(filtered_pages)

    # Build histogram for result (keeping for compatibility)
    histogram = TextHistogram.from_pages(filtered_pages)

    # Phase 3: Classify using the hints; one Classifier is shared by all pages
    classifier = Classifier(ClassifierConfig(font_size_hints=font_size_hints))
    results = [classifier.classify(page_data) for page_data in filtered_pages]

    return BatchClassificationResult(results=results, histogram=histogram)
125

126

127
# Union of the concrete classifier stage types; used to annotate the
# `classifiers` list held by `Classifier`.
type Classifiers = (
    PageNumberClassifier
    | PartCountClassifier
    | StepNumberClassifier
    | PartsClassifier
    | PartsListClassifier
    | PartsImageClassifier
    | StepClassifier
)
136

137

138
class Classifier:
    """
    Runs the ordered chain of rule-based classifiers over a page.

    The instance keeps only its configuration and the list of stage objects;
    it is intended to be stateless between calls to ``classify``.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the stage list and verify that its order satisfies dependencies.

        Args:
            config: Configuration handed to every stage classifier.

        Raises:
            ValueError: If a stage's ``requires`` labels are not all present in
                the ``outputs`` of the stages listed before it.
        """
        self.config = config
        self.classifiers: list[Classifiers] = [
            PageNumberClassifier(config, self),
            PartCountClassifier(config, self),
            StepNumberClassifier(config, self),
            PartsClassifier(config, self),
            PartsListClassifier(config, self),
            PartsImageClassifier(config, self),
            StepClassifier(config, self),
        ]

        # TODO Create a directed graph, and run it in order.
        available: set[str] = set()
        for stage in self.classifiers:
            required = getattr(stage, "requires", set())
            if required.issubset(available):
                # Dependencies satisfied: this stage's labels become available.
                available |= getattr(stage, "outputs", set())
                continue

            missing = ", ".join(sorted(required - available))
            stage_name = stage.__class__.__name__
            raise ValueError(
                f"Classifier order invalid: {stage_name} requires "
                f"labels not yet produced: {missing}"
            )

    def classify(self, page_data: PageData) -> ClassificationResult:
        """
        Run every stage against ``page_data`` and return the collected result.

        Each stage is asked to ``evaluate`` and then ``classify`` before the
        next stage starts. Post-classification warnings are attached to the
        result. It does NOT modify page_data directly.
        """
        outcome = ClassificationResult(page_data=page_data)

        for stage in self.classifiers:
            stage.evaluate(page_data, outcome)
            stage.classify(page_data, outcome)

        for message in self._log_post_classification_warnings(page_data, outcome):
            outcome.add_warning(message)

        return outcome

    def _remove_child_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark as removed every block whose bbox lies fully inside ``target``'s.

        Args:
            page_data: Page whose ``blocks`` are scanned.
            target: Block whose ``bbox`` is the enclosing box; never removed.
            result: Receives a ``mark_removed`` call ("child_bbox") per match.
            keep_ids: ``id()`` values of blocks that must be left alone.
        """
        protected = set() if keep_ids is None else keep_ids
        outer_bbox = target.bbox

        for candidate in page_data.blocks:
            if candidate is not target and id(candidate) not in protected:
                if candidate.bbox.fully_inside(outer_bbox):
                    reason = RemovalReason(
                        reason_type="child_bbox", target_block=target
                    )
                    result.mark_removed(candidate, reason)

    def _remove_similar_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark as removed every block whose bbox nearly coincides with ``target``'s.

        A block counts as similar when its IoU with the target is at least 0.8,
        or when both center coordinates are within 1.5 of the target's and its
        area differs from the target's by at most 12% of the target area.

        Args:
            page_data: Page whose ``blocks`` are scanned.
            target: Reference block; never removed.
            result: Receives a ``mark_removed`` call ("similar_bbox") per match.
            keep_ids: ``id()`` values of blocks that must be left alone.
        """
        protected = set() if keep_ids is None else keep_ids

        ref_area = target.bbox.area
        ref_x, ref_y = target.bbox.center

        iou_threshold = 0.8
        center_eps = 1.5
        area_tol = 0.12

        for candidate in page_data.blocks:
            if candidate is target or id(candidate) in protected:
                continue

            box = candidate.bbox
            similar = target.bbox.iou(box) >= iou_threshold

            if not similar:
                # Fallback test: nearly the same center and nearly the same area.
                cx, cy = box.center
                if abs(cx - ref_x) <= center_eps and abs(cy - ref_y) <= center_eps:
                    area = box.area
                    similar = (
                        ref_area > 0
                        and abs(area - ref_area) / ref_area <= area_tol
                    )

            if similar:
                result.mark_removed(
                    candidate,
                    RemovalReason(reason_type="similar_bbox", target_block=target),
                )

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Build consistency warnings from the labels recorded in ``result``.

        Despite the name, nothing is logged here; the messages are returned.
        Checks, in order: the page has a "page_number" label; every
        "parts_list" block fully contains at least one "part_count" block;
        every Text "step_number" block has some parts list ending at or above
        its top edge (with a tolerance of 2.0).

        Returns:
            The warning messages, possibly empty.
        """
        labeled = result.get_labeled_blocks()
        messages: list[str] = []

        # Check if there's a page number
        if not any(label == "page_number" for label in labeled.values()):
            messages.append(f"Page {page_data.page_number}: missing page number")

        # Get elements by label
        parts_lists = []
        for block, label in labeled.items():
            if label == "parts_list":
                parts_lists.append(block)
        part_counts = []
        for block, label in labeled.items():
            if label == "part_count":
                part_counts.append(block)

        for parts_list in parts_lists:
            contained = [
                count
                for count in part_counts
                if count.bbox.fully_inside(parts_list.bbox)
            ]
            if contained:
                continue
            messages.append(
                f"Page {page_data.page_number}: parts list at {parts_list.bbox} "
                f"contains no part counts"
            )

        step_numbers: list[Text] = []
        for block, label in labeled.items():
            if label == "step_number" and isinstance(block, Text):
                step_numbers.append(block)

        above_eps = 2.0
        for step in step_numbers:
            step_bbox = step.bbox
            lists_above = [
                parts_list
                for parts_list in parts_lists
                if parts_list.bbox.y1 <= step_bbox.y0 + above_eps
            ]
            if len(lists_above) == 0:
                messages.append(
                    f"Page {page_data.page_number}: step number '{step.text}' "
                    f"at {step_bbox} has no parts list above it"
                )
        return messages
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc