• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19221936920

10 Nov 2025 05:39AM UTC coverage: 86.707% (-0.4%) from 87.11%
19221936920

push

github

bramp
fix(pdf_extract): Fix argparse prefix matching and improve boolean flags

- Add allow_abbrev=False to prevent --draw from matching --draw-deleted
- Change to BooleanOptionalAction for --summary, --draw, and --draw-deleted
  This provides both positive and negative forms (e.g., --draw/--no-draw)
- Remove set_defaults() call since defaults are now in argument definitions
- Fixes bug where deleted blocks were drawn without --draw-deleted flag

4253 of 4905 relevant lines covered (86.71%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.45
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
Classifiers run in a fixed, enforced order because later stages depend on
7
labels produced by earlier stages:
8

9
1) PageNumberClassifier → outputs: "page_number"
10
2) PartCountClassifier  → outputs: "part_count"
11
3) StepNumberClassifier → outputs: "step_number" (uses page_number size)
12
4) PartsClassifier      → outputs: "part" (requires part_count, pairs with images)
13
5) PartsListClassifier  → outputs: "parts_list" (requires part)
14
6) PartsImageClassifier → outputs: "part_image" (requires parts_list, part_count)
15
7) StepClassifier       → outputs: "step" (requires step_number and parts_list)
16
8) PageClassifier       → outputs: "page" (requires page_number and step)
17

18
If the order is changed such that a classifier runs before its requirements
19
are available, a ValueError will be raised at initialization time.
20
"""
21

22
from __future__ import annotations
1✔
23

24
import logging
1✔
25

26
from build_a_long.pdf_extract.classifier.block_filter import filter_duplicate_blocks
1✔
27
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
28
    BatchClassificationResult,
29
    ClassificationResult,
30
    ClassifierConfig,
31
    RemovalReason,
32
)
33
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
34
from build_a_long.pdf_extract.classifier.page_classifier import PageClassifier
1✔
35
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
36
    PageNumberClassifier,
37
)
38
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
39
    PartCountClassifier,
40
)
41
from build_a_long.pdf_extract.classifier.parts_classifier import (
1✔
42
    PartsClassifier,
43
)
44
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
45
    PartsImageClassifier,
46
)
47
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
48
    PartsListClassifier,
49
)
50
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
51
    StepClassifier,
52
)
53
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
54
    StepNumberClassifier,
55
)
56
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
57
from build_a_long.pdf_extract.extractor import PageData
1✔
58
from build_a_long.pdf_extract.extractor.page_blocks import Text
1✔
59

60
logger = logging.getLogger(__name__)
1✔
61

62

63
def classify_elements(page: PageData) -> ClassificationResult:
    """Run the rule-based classifier over one page with a default configuration.

    A fresh ``ClassifierConfig()`` (constructed with no arguments) and a fresh
    ``Classifier`` are built on every call. Unlike ``classify_pages``, this
    function does not filter duplicate blocks and does not supply font size
    hints to the configuration.

    Args:
        page: The single PageData object to classify.

    Returns:
        The ClassificationResult produced by ``Classifier.classify`` for ``page``.
    """
    return Classifier(ClassifierConfig()).classify(page)
1✔
76

77

78
def classify_pages(pages: list[PageData]) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:
    1. Filtering phase: Remove duplicate/similar blocks on each page
    2. Analysis phase: Build font size hints from text properties across all pages
    3. Classification phase: Use hints to guide element classification

    The input ``PageData`` objects are not modified; each page is rebuilt as a
    new ``PageData`` carrying the same ``page_number`` and ``bbox`` but only the
    filtered blocks.

    Args:
        pages: A list of PageData objects to classify.

    Returns:
        BatchClassificationResult containing per-page results (in the same
        order as ``pages``) and the global text histogram.
    """
    # Phase 1: Filter duplicate blocks on each page
    filtered_pages: list[PageData] = []
    for page_data in pages:
        filtered_blocks = filter_duplicate_blocks(page_data.blocks)

        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is actually enabled for this logger.
        logger.debug(
            "Page %s: filtered %d duplicate blocks",
            page_data.page_number,
            len(page_data.blocks) - len(filtered_blocks),
        )

        # Create a new PageData with filtered blocks
        filtered_pages.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=filtered_blocks,
            )
        )

    # Phase 2: Extract font size hints from all pages
    font_size_hints = FontSizeHints.from_pages(filtered_pages)

    # Build histogram for result (keeping for compatibility)
    histogram = TextHistogram.from_pages(filtered_pages)

    # Phase 3: Classify using the hints. One Classifier instance is shared by
    # every page so that all pages see the same configuration.
    classifier = Classifier(ClassifierConfig(font_size_hints=font_size_hints))
    results = [classifier.classify(page_data) for page_data in filtered_pages]

    return BatchClassificationResult(results=results, histogram=histogram)
1✔
127

128

129
type Classifiers = (
1✔
130
    PageNumberClassifier
131
    | PartCountClassifier
132
    | StepNumberClassifier
133
    | PartsClassifier
134
    | PartsListClassifier
135
    | PartsImageClassifier
136
    | StepClassifier
137
    | PageClassifier
138
)
139

140

141
class Classifier:
    """
    Performs a single run of classification based on rules, configuration, and hints.
    This class should be stateless.

    The individual stage classifiers are created once in ``__init__`` and run
    in list order by ``classify``. Each stage receives this object as its
    second constructor argument.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the stage pipeline and verify its ordering.

        Args:
            config: Configuration handed to every stage classifier.

        Raises:
            ValueError: If a stage declares (via a ``requires`` attribute) a
                label that no earlier stage declares in its ``outputs``.
        """
        self.config = config
        self.classifiers: list[Classifiers] = [
            PageNumberClassifier(config, self),
            PartCountClassifier(config, self),
            StepNumberClassifier(config, self),
            PartsClassifier(config, self),
            PartsListClassifier(config, self),
            PartsImageClassifier(config, self),
            StepClassifier(config, self),
            PageClassifier(config, self),
        ]

        # TODO Create a directed graph, and run it in order.
        available: set[str] = set()
        for stage in self.classifiers:
            # Stages without a ``requires`` attribute are treated as having
            # no prerequisites.
            unmet = getattr(stage, "requires", set()) - available
            if unmet:
                missing = ", ".join(sorted(unmet))
                raise ValueError(
                    f"Classifier order invalid: {stage.__class__.__name__} requires "
                    f"labels not yet produced: {missing}"
                )
            available |= getattr(stage, "outputs", set())

    def classify(self, page_data: PageData) -> ClassificationResult:
        """
        Runs the classification logic and returns a result.
        It does NOT modify page_data directly.

        Every stage's ``evaluate`` and then ``classify`` is invoked in pipeline
        order against one shared result object; afterwards the
        post-classification warnings are attached to that result.
        """
        outcome = ClassificationResult(page_data=page_data)

        for stage in self.classifiers:
            stage.evaluate(page_data, outcome)
            stage.classify(page_data, outcome)

        for message in self._log_post_classification_warnings(page_data, outcome):
            outcome.add_warning(message)

        return outcome

    def _remove_child_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark blocks lying fully inside ``target``'s bbox as removed.

        Args:
            page_data: Page whose ``blocks`` are scanned.
            target: Block whose ``bbox`` acts as the container; never removed.
            result: Receives ``mark_removed`` calls with a ``"child_bbox"``
                removal reason pointing at ``target``.
            keep_ids: ``id()`` values of blocks that must be left alone.
        """
        protected = set() if keep_ids is None else keep_ids
        container = target.bbox

        for block in page_data.blocks:
            if (
                block is not target
                and id(block) not in protected
                and block.bbox.fully_inside(container)
            ):
                result.mark_removed(
                    block, RemovalReason(reason_type="child_bbox", target_block=target)
                )

    def _remove_similar_bboxes(
        self,
        page_data: PageData,
        target,
        result: ClassificationResult,
        keep_ids: set[int] | None = None,
    ) -> None:
        """Mark blocks whose bbox nearly coincides with ``target``'s as removed.

        A block counts as similar when either its IoU with the target bbox is
        at least 0.8, or both centre coordinates are within 1.5 of the
        target's and its area differs from the target's by at most 12%
        (relative to the target area, which must be positive).

        Args:
            page_data: Page whose ``blocks`` are scanned.
            target: Reference block; never removed.
            result: Receives ``mark_removed`` calls with a ``"similar_bbox"``
                removal reason pointing at ``target``.
            keep_ids: ``id()`` values of blocks that must be left alone.
        """
        protected = set() if keep_ids is None else keep_ids

        ref_area = target.bbox.area
        ref_cx, ref_cy = target.bbox.center

        iou_min = 0.8
        center_eps = 1.5
        area_tol = 0.12

        for block in page_data.blocks:
            if block is target or id(block) in protected:
                continue

            candidate = block.bbox
            if target.bbox.iou(candidate) >= iou_min:
                is_similar = True
            else:
                # Fallback: same centre and roughly the same area.
                is_similar = False
                cx, cy = candidate.center
                if abs(cx - ref_cx) <= center_eps and abs(cy - ref_cy) <= center_eps:
                    candidate_area = candidate.area
                    is_similar = (
                        ref_area > 0
                        and abs(candidate_area - ref_area) / ref_area <= area_tol
                    )

            if is_similar:
                result.mark_removed(
                    block,
                    RemovalReason(reason_type="similar_bbox", target_block=target),
                )

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Collect consistency warnings about the labels assigned to a page.

        Despite the name, nothing is logged here; the messages are returned to
        the caller. Three checks are made, in this order:

        * the page has no block labeled ``"page_number"``;
        * a ``"parts_list"`` block has no ``"part_count"`` block fully inside it;
        * a ``"step_number"`` Text block has no parts list whose bottom edge
          (``y1``) is at or above the step's top edge (``y0``), within a
          tolerance of 2.0.

        Returns:
            The warning messages, possibly empty.
        """
        labeled = result.get_labeled_blocks()
        page_no = page_data.page_number
        messages: list[str] = []

        def with_label(wanted: str) -> list:
            # Blocks carrying exactly this label, in mapping order.
            return [blk for blk, lbl in labeled.items() if lbl == wanted]

        # Check if there's a page number
        if not with_label("page_number"):
            messages.append(f"Page {page_no}: missing page number")

        parts_lists = with_label("parts_list")
        part_counts = with_label("part_count")

        for parts_list in parts_lists:
            if not any(pc.bbox.fully_inside(parts_list.bbox) for pc in part_counts):
                messages.append(
                    f"Page {page_no}: parts list at {parts_list.bbox} "
                    f"contains no part counts"
                )

        # Only Text blocks are considered for the step-number check.
        step_numbers: list[Text] = [
            blk
            for blk, lbl in labeled.items()
            if lbl == "step_number" and isinstance(blk, Text)
        ]
        above_eps = 2.0
        for step in step_numbers:
            step_box = step.bbox
            top_limit = step_box.y0 + above_eps
            if not any(pl.bbox.y1 <= top_limit for pl in parts_lists):
                messages.append(
                    f"Page {page_no}: step number '{step.text}' "
                    f"at {step_box} has no parts list above it"
                )
        return messages
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc