• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19796282858

30 Nov 2025 08:32AM UTC coverage: 90.603% (-0.04%) from 90.646%
19796282858

push

github

bramp
test: update golden files for classifier with hints

Regenerated golden files using classifier config with font_hints and
page_hints for improved classification accuracy.

9835 of 10855 relevant lines covered (90.6%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    NewBagClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.batch_classification_result import (
1✔
28
    BatchClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.block_filter import (
1✔
31
    filter_background_blocks,
32
    filter_duplicate_blocks,
33
    filter_overlapping_text_blocks,
34
)
35
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
36
    ClassificationResult,
37
)
38
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
39
from build_a_long.pdf_extract.classifier.pages import (
1✔
40
    PageHintCollection,
41
)
42
from build_a_long.pdf_extract.classifier.pages.page_classifier import PageClassifier
1✔
43
from build_a_long.pdf_extract.classifier.pages.page_number_classifier import (
1✔
44
    PageNumberClassifier,
45
)
46
from build_a_long.pdf_extract.classifier.pages.progress_bar_classifier import (
1✔
47
    ProgressBarClassifier,
48
)
49
from build_a_long.pdf_extract.classifier.parts import (
1✔
50
    PartCountClassifier,
51
    PartNumberClassifier,
52
    PartsClassifier,
53
    PartsImageClassifier,
54
    PartsListClassifier,
55
    PieceLengthClassifier,
56
    ShineClassifier,
57
)
58
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
59
from build_a_long.pdf_extract.classifier.steps import (
1✔
60
    ArrowClassifier,
61
    DiagramClassifier,
62
    RotationSymbolClassifier,
63
    StepClassifier,
64
    StepCountClassifier,
65
    StepNumberClassifier,
66
    SubStepClassifier,
67
)
68
from build_a_long.pdf_extract.classifier.text import FontSizeHints, TextHistogram
1✔
69
from build_a_long.pdf_extract.classifier.topological_sort import topological_sort
1✔
70
from build_a_long.pdf_extract.extractor import PageData
1✔
71
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
72
    PageNumber,
73
    PartCount,
74
    PartsList,
75
    StepNumber,
76
)
77
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
78

79
# Module-level logger; configured by the application, not by this module.
logger = logging.getLogger(__name__)

# Pages with more blocks than this threshold will be skipped during classification.
# This avoids O(n²) algorithms (like duplicate detection) that become prohibitively
# slow on pages with thousands of vector drawings. Such pages are typically info
# pages where each character is a separate vector graphic.
# TODO: Add spatial indexing to handle high-block pages efficiently.
MAX_BLOCKS_PER_PAGE = 1000
87

88

89
# TODO require config, so we don't accidentally use default empty config
def classify_elements(
    page: PageData, config: ClassifierConfig | None = None
) -> ClassificationResult:
    """Classify and label elements on a single page using rule-based heuristics.

    Args:
        page: A single PageData object to classify.
        config: Optional classifier configuration with font/page hints.
            If None, a default empty configuration (no hints) is used.
            For better classification accuracy, pass a config with
            FontSizeHints computed from multiple pages of the same PDF.

    Returns:
        A ClassificationResult object containing the classification results.
    """
    effective_config = config if config is not None else ClassifierConfig()
    return Classifier(effective_config).classify(page)
110

111

112
def _filter_page_blocks(
    page_data: PageData,
) -> tuple[
    list[Blocks],
    dict[Blocks, RemovalReason],
    dict[Blocks, RemovalReason],
    dict[Blocks, RemovalReason],
]:
    """Run the standard block filters on a single page.

    Applies, in order: background removal (full-page blocks), overlapping
    text removal (e.g., "4" and "43" at the same origin), and IOU-based
    duplicate image/drawing removal.

    Args:
        page_data: The page whose blocks should be filtered.

    Returns:
        A tuple of (kept_blocks, background_removed, text_removed,
        bbox_removed) where the three dicts map each removed block to its
        RemovalReason.
    """
    kept_blocks = page_data.blocks

    # Filter background blocks (full page blocks like background images)
    kept_blocks, background_removed = filter_background_blocks(
        kept_blocks, page_data.bbox.width, page_data.bbox.height
    )

    # Filter overlapping text blocks (e.g., "4" and "43" at same origin)
    kept_blocks, text_removed = filter_overlapping_text_blocks(kept_blocks)

    # Filter duplicate image/drawing blocks based on IOU
    kept_blocks, bbox_removed = filter_duplicate_blocks(kept_blocks)

    return kept_blocks, background_removed, text_removed, bbox_removed


def classify_pages(
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:
    1. Filtering phase: Mark duplicate/similar blocks as removed on each page
    2. Analysis phase: Build font size hints from text properties (excluding
       removed blocks)
    3. Classification phase: Use hints to guide element classification

    Args:
        pages: A list of PageData objects to classify.
        pages_for_hints: Optional list of pages to use for generating font/page hints.
            If None, uses `pages`. This allows generating hints from all pages
            while only classifying a subset (e.g., when using --pages filter).

    Returns:
        BatchClassificationResult containing per-page results and global histogram
    """
    # Use all pages for hint generation if provided, otherwise use selected pages
    hint_pages = pages_for_hints if pages_for_hints is not None else pages

    # Phase 1: Filter duplicate blocks on each page and track removals
    # Skip pages with too many blocks to avoid O(n²) performance issues
    removed_blocks_per_page: list[dict[Blocks, RemovalReason]] = []
    skipped_pages: set[int] = set()  # Track page numbers that are skipped

    for page_data in pages:
        # Skip pages with too many blocks - these are likely info/inventory pages
        # with vectorized text that cause O(n²) algorithms to be very slow
        if len(page_data.blocks) > MAX_BLOCKS_PER_PAGE:
            logger.debug(
                f"Page {page_data.page_number}: skipping classification "
                f"({len(page_data.blocks)} blocks exceeds threshold of "
                f"{MAX_BLOCKS_PER_PAGE})"
            )
            skipped_pages.add(page_data.page_number)
            removed_blocks_per_page.append({})
            continue

        _, background_removed, text_removed, bbox_removed = _filter_page_blocks(
            page_data
        )

        # Combine all removal mappings into a single dict for this page
        combined_removed_mapping = {
            **text_removed,
            **bbox_removed,
            **background_removed,
        }

        logger.debug(
            f"Page {page_data.page_number}: "
            f"filtered {len(text_removed)} overlapping text, "
            f"{len(bbox_removed)} duplicate bbox blocks, "
            f"{len(background_removed)} background blocks"
        )

        removed_blocks_per_page.append(combined_removed_mapping)

    # Phase 2: Extract font size hints from hint pages (excluding removed blocks)
    # Build pages with non-removed blocks for hint extraction and histogram

    # Filter duplicates from hint pages (may be different from pages to classify)
    hint_pages_without_duplicates = []
    for page_data in hint_pages:
        # Skip high-block pages for hints too (same threshold)
        if len(page_data.blocks) > MAX_BLOCKS_PER_PAGE:
            continue

        # TODO We are re-filtering duplicates here; optimize by changing the API
        # to accept one list of PageData, and separate by page_numbers.
        kept_blocks, _, _, _ = _filter_page_blocks(page_data)

        hint_pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=kept_blocks,
            )
        )

    # Build pages without duplicates for classification
    pages_without_duplicates = []
    for page_data, removed_mapping in zip(pages, removed_blocks_per_page, strict=True):
        # We need to filter blocks that were removed by ANY filter
        non_removed_blocks = [
            block for block in page_data.blocks if block not in removed_mapping
        ]
        pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=non_removed_blocks,
            )
        )

    # Generate hints from hint pages, histogram from pages to classify
    font_size_hints = FontSizeHints.from_pages(hint_pages_without_duplicates)
    page_hints = PageHintCollection.from_pages(hint_pages_without_duplicates)
    histogram = TextHistogram.from_pages(pages_without_duplicates)

    # Phase 3: Classify using the hints (on pages without duplicates)
    config = ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
    classifier = Classifier(config)

    results = []
    for page_data, page_without_duplicates, removed_mapping in zip(
        pages, pages_without_duplicates, removed_blocks_per_page, strict=True
    ):
        # Handle skipped pages: emit a placeholder result explaining why
        if page_data.page_number in skipped_pages:
            result = ClassificationResult(
                page_data=page_data,
                skipped_reason=(
                    f"Page has {len(page_data.blocks)} blocks, which exceeds "
                    f"the threshold of {MAX_BLOCKS_PER_PAGE}. This is likely an "
                    f"info/inventory page with vectorized text."
                ),
            )
            results.append(result)
            continue

        # Classify using only non-removed blocks
        result = classifier.classify(page_without_duplicates)

        # Update result to use original page_data (with all blocks)
        result.page_data = page_data

        # Mark removed blocks so downstream consumers see why they were dropped
        for removed_block, removal_reason in removed_mapping.items():
            result.mark_removed(removed_block, removal_reason)

        results.append(result)

    return BatchClassificationResult(results=results, histogram=histogram)
266

267

268
type Classifiers = (
1✔
269
    PageNumberClassifier
270
    | ProgressBarClassifier
271
    | BagNumberClassifier
272
    | PartCountClassifier
273
    | PartNumberClassifier
274
    | StepNumberClassifier
275
    | StepCountClassifier
276
    | PieceLengthClassifier
277
    | PartsClassifier
278
    | PartsListClassifier
279
    | PartsImageClassifier
280
    | ShineClassifier
281
    | NewBagClassifier
282
    | DiagramClassifier
283
    | ArrowClassifier
284
    | SubStepClassifier
285
    | StepClassifier
286
    | PageClassifier
287
)
288

289

290
class Classifier:
    """
    Performs a single run of classification based on rules, configuration, and hints.
    This class should be stateless.
    """

    def __init__(self, config: ClassifierConfig):
        self.config = config
        # Instantiate one classifier of each type, then order them so that
        # every classifier runs after the classifiers it depends on.
        classifier_types = [
            PageNumberClassifier,
            ProgressBarClassifier,
            BagNumberClassifier,
            PartCountClassifier,
            PartNumberClassifier,
            StepNumberClassifier,
            StepCountClassifier,
            PieceLengthClassifier,
            PartsClassifier,
            PartsListClassifier,
            DiagramClassifier,
            RotationSymbolClassifier,
            ArrowClassifier,
            PartsImageClassifier,
            ShineClassifier,
            NewBagClassifier,
            SubStepClassifier,
            StepClassifier,
            PageClassifier,
        ]
        self.classifiers = topological_sort(
            [classifier_type(config) for classifier_type in classifier_types]
        )

    def classify(self, page_data: PageData) -> ClassificationResult:
        """Run the classification pipeline for one page and return the result.

        ``page_data`` itself is NOT modified; all output is recorded on the
        returned ClassificationResult.

        The pipeline runs in three phases:
        1. Score all classifiers (bottom-up) - score() also auto-registers
           each classifier for its output labels
        2. Construct final elements (top-down starting from Page)
        3. Collect post-classification warnings
        """
        result = ClassificationResult(page_data=page_data)

        logger.debug(f"Starting classification for page {page_data.page_number}")

        # Phase 1: bottom-up scoring of candidate elements.
        for c in self.classifiers:
            c.score(result)

        # Phase 2: top-down construction, rooted at the PageClassifier.
        root = next(c for c in self.classifiers if isinstance(c, PageClassifier))
        root.build_all(result)

        # Phase 3: attach structural warnings to the result.
        # TODO Do we actually ever add warnings?
        for warning in self._log_post_classification_warnings(page_data, result):
            result.add_warning(warning)

        return result

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Build human-readable warnings about suspicious page structure."""
        warnings: list[str] = []

        # A page is expected to carry a page number.
        if not result.get_winners_by_score("page_number", PageNumber):
            warnings.append(f"Page {page_data.page_number}: missing page number")

        parts_lists = result.get_winners_by_score("parts_list", PartsList)
        part_counts = result.get_winners_by_score("part_count", PartCount)

        # Each parts list should contain at least one part count.
        for pl in parts_lists:
            if not any(t.bbox.fully_inside(pl.bbox) for t in part_counts):
                warnings.append(
                    f"Page {page_data.page_number}: parts list at {pl.bbox} "
                    f"contains no part counts"
                )

        # Each step number should have a parts list above it, within a small
        # vertical tolerance.
        ABOVE_EPS = 2.0
        for step in result.get_winners_by_score("step_number", StepNumber):
            sb = step.bbox
            if not any(pl.bbox.y1 <= sb.y0 + ABOVE_EPS for pl in parts_lists):
                warnings.append(
                    f"Page {page_data.page_number}: step number '{step.value}' "
                    f"at {sb} has no parts list above it"
                )
        return warnings
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc