• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19770083201

28 Nov 2025 05:10PM UTC coverage: 90.346% (+0.6%) from 89.792%
19770083201

push

github

bramp
Update AGENTS on the rules around dataclasses vs pydantic models.

8451 of 9354 relevant lines covered (90.35%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.89
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    NewBagClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.batch_classification_result import (
1✔
28
    BatchClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.block_filter import (
1✔
31
    filter_duplicate_blocks,
32
    filter_overlapping_text_blocks,
33
)
34
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
35
    ClassificationResult,
36
)
37
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
38
from build_a_long.pdf_extract.classifier.pages import (
1✔
39
    PageHintCollection,
40
)
41
from build_a_long.pdf_extract.classifier.pages.page_classifier import PageClassifier
1✔
42
from build_a_long.pdf_extract.classifier.pages.page_number_classifier import (
1✔
43
    PageNumberClassifier,
44
)
45
from build_a_long.pdf_extract.classifier.pages.progress_bar_classifier import (
1✔
46
    ProgressBarClassifier,
47
)
48
from build_a_long.pdf_extract.classifier.parts import (
1✔
49
    PartCountClassifier,
50
    PartNumberClassifier,
51
    PartsClassifier,
52
    PartsImageClassifier,
53
    PartsListClassifier,
54
    PieceLengthClassifier,
55
    ShineClassifier,
56
)
57
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
58
from build_a_long.pdf_extract.classifier.steps import (
1✔
59
    DiagramClassifier,
60
    RotationSymbolClassifier,
61
    StepClassifier,
62
    StepNumberClassifier,
63
)
64
from build_a_long.pdf_extract.classifier.text import FontSizeHints, TextHistogram
1✔
65
from build_a_long.pdf_extract.classifier.topological_sort import topological_sort
1✔
66
from build_a_long.pdf_extract.extractor import PageData
1✔
67
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
68
    PageNumber,
69
    PartCount,
70
    PartsList,
71
    StepNumber,
72
)
73
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
74

75
logger = logging.getLogger(__name__)
1✔
76

77

78
def classify_elements(page: PageData) -> ClassificationResult:
    """Classify the elements of one page with a default-configured classifier.

    A fresh ``ClassifierConfig()`` (no arguments, so no font-size or page
    hints are passed in) is used to build a ``Classifier``, which is then run
    on ``page``.

    Args:
        page: The single PageData object to classify.

    Returns:
        The ClassificationResult produced by ``Classifier.classify``.
    """
    return Classifier(ClassifierConfig()).classify(page)
1✔
91

92

93
def classify_pages(
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
) -> BatchClassificationResult:
    """Classify a batch of pages, sharing hints derived from the whole batch.

    The work happens in three phases:
    1. Filtering: on every page, find overlapping text blocks and duplicate
       blocks, remembering which block each removed block was replaced by.
    2. Analysis: build font-size hints and page hints from the hint pages,
       and a text histogram from the pages being classified (all computed
       on filtered blocks only).
    3. Classification: run one shared ``Classifier`` over each filtered page,
       then re-attach the original page data and record the removals.

    Args:
        pages: The PageData objects to classify.
        pages_for_hints: Optional pages used only for generating font/page
            hints. When None, ``pages`` is used. This allows hints to come
            from a whole document while only a subset is classified (e.g.
            when using a --pages filter).

    Returns:
        A BatchClassificationResult holding one result per entry of ``pages``
        (in the same order) together with the text histogram.
    """
    hint_source = pages if pages_for_hints is None else pages_for_hints

    # Phase 1: per page, record removed-block -> kept-block mappings.
    removals_per_page: list[dict[Blocks, Blocks]] = []
    for page_data in pages:
        # Overlapping text first (e.g., "4" and "43" at the same origin) ...
        survivors, text_removed = filter_overlapping_text_blocks(page_data.blocks)
        # ... then duplicate image/drawing blocks (IOU based) among what is left.
        _, bbox_removed = filter_duplicate_blocks(survivors)

        logger.debug(
            f"Page {page_data.page_number}: "
            f"filtered {len(text_removed)} overlapping text, "
            f"{len(bbox_removed)} duplicate bbox blocks"
        )

        # On a key collision the bbox mapping wins, as it is unpacked last.
        removals_per_page.append({**text_removed, **bbox_removed})

    # Phase 2: build de-duplicated copies of the hint pages. These may differ
    # from the pages being classified, so they are filtered separately.
    # TODO We are re-filtering duplicates here; optimize by changing the API
    # to accept one list of PageData, and separate by page_numbers.
    deduped_hint_pages = []
    for hint_page in hint_source:
        hint_blocks, _ = filter_overlapping_text_blocks(hint_page.blocks)
        hint_blocks, _ = filter_duplicate_blocks(hint_blocks)
        deduped_hint_pages.append(
            PageData(
                page_number=hint_page.page_number,
                bbox=hint_page.bbox,
                blocks=hint_blocks,
            )
        )

    # De-duplicated copies of the pages to classify: every original block
    # that is not a key of that page's removal mapping.
    deduped_pages = [
        PageData(
            page_number=original.page_number,
            bbox=original.bbox,
            blocks=[blk for blk in original.blocks if blk not in removed],
        )
        for original, removed in zip(pages, removals_per_page, strict=True)
    ]

    # Hints come from the hint pages; the histogram from the classified pages.
    font_size_hints = FontSizeHints.from_pages(deduped_hint_pages)
    page_hints = PageHintCollection.from_pages(deduped_hint_pages)
    histogram = TextHistogram.from_pages(deduped_pages)

    # Phase 3: one classifier, configured with the hints, handles every page.
    classifier = Classifier(
        ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
    )

    results = []
    for original, deduped, removed in zip(
        pages, deduped_pages, removals_per_page, strict=True
    ):
        # Classification only ever sees the surviving blocks.
        page_result = classifier.classify(deduped)

        # Point the result back at the original page (with all of its blocks).
        page_result.page_data = original

        # Record each filtered-out block together with the block kept instead.
        for dropped, kept in removed.items():
            page_result.mark_removed(
                dropped,
                RemovalReason(reason_type="duplicate_bbox", target_block=kept),
            )

        results.append(page_result)

    return BatchClassificationResult(results=results, histogram=histogram)
1✔
197

198

199
type Classifiers = (
1✔
200
    PageNumberClassifier
201
    | ProgressBarClassifier
202
    | BagNumberClassifier
203
    | PartCountClassifier
204
    | PartNumberClassifier
205
    | StepNumberClassifier
206
    | PieceLengthClassifier
207
    | PartsClassifier
208
    | PartsListClassifier
209
    | PartsImageClassifier
210
    | ShineClassifier
211
    | NewBagClassifier
212
    | DiagramClassifier
213
    | StepClassifier
214
    | PageClassifier
215
)
216

217

218
class Classifier:
    """
    Runs one rule-based classification pass driven by a ClassifierConfig.

    The only attributes set are the config and the ordered classifier list,
    both assigned once in ``__init__``; this class should be stateless across
    ``classify`` calls.
    """

    def __init__(self, config: ClassifierConfig):
        """Instantiate every classifier with ``config`` and order them.

        Args:
            config: Configuration handed unchanged to each classifier.
        """
        self.config = config
        unordered = [
            PageNumberClassifier(config),
            ProgressBarClassifier(config),
            BagNumberClassifier(config),
            PartCountClassifier(config),
            PartNumberClassifier(config),
            StepNumberClassifier(config),
            PieceLengthClassifier(config),
            PartsClassifier(config),
            PartsListClassifier(config),
            DiagramClassifier(config),
            RotationSymbolClassifier(config),
            PartsImageClassifier(config),
            ShineClassifier(config),
            NewBagClassifier(config),
            StepClassifier(config),
            PageClassifier(config),
        ]
        # Order the classifiers topologically by their dependencies.
        self.classifiers = topological_sort(unordered)

    def classify(self, page_data: PageData) -> ClassificationResult:
        """
        Classify one page and return the outcome as a new result object.

        ``page_data`` is not modified directly; everything is recorded on the
        ClassificationResult created here.

        The process runs in two phases, followed by sanity checks:
        1. Score all classifiers (bottom-up) - auto-registers classifiers
        2. Construct final elements (top-down starting from Page)
        Afterwards, post-classification warnings are collected and attached
        to the result.

        Args:
            page_data: The page to classify.

        Returns:
            The populated ClassificationResult for ``page_data``.
        """
        result = ClassificationResult(page_data=page_data)

        logger.debug(f"Starting classification for page {page_data.page_number}")

        # Phase 1 (bottom-up): every classifier scores its candidates, in
        # dependency order.
        # Note: score() automatically registers each classifier for its
        # output labels.
        for scorer in self.classifiers:
            scorer.score(result)

        # Phase 2 (top-down): construction starts at the PageClassifier, so
        # locate the first one in the sorted list.
        root = next(c for c in self.classifiers if isinstance(c, PageClassifier))
        root.build_all(result)

        # TODO Do we actually ever add warnings?
        for message in self._log_post_classification_warnings(page_data, result):
            result.add_warning(message)

        return result

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Collect human-readable warnings about suspicious classification output.

        Three checks are made on the winning elements in ``result``:
        * the page has at least one page number,
        * each parts list fully contains at least one part count,
        * each step number has at least one parts list above it.

        Args:
            page_data: The classified page; its ``page_number`` is used in
                the messages.
            result: The classification result to inspect.

        Returns:
            The warning messages, in the order the checks are run. The list
            is empty when nothing looks wrong.
        """
        messages = []

        # Check 1: a page number was found.
        if not result.get_winners_by_score("page_number", PageNumber):
            messages.append(f"Page {page_data.page_number}: missing page number")

        parts_lists = result.get_winners_by_score("parts_list", PartsList)
        part_counts = result.get_winners_by_score("part_count", PartCount)

        # Check 2: every parts list encloses at least one part count.
        for pl in parts_lists:
            has_count = any(
                count.bbox.fully_inside(pl.bbox) for count in part_counts
            )
            if has_count:
                continue
            messages.append(
                f"Page {page_data.page_number}: parts list at {pl.bbox} "
                f"contains no part counts"
            )

        # Check 3: every step number has a parts list whose y1 is no greater
        # than the step's y0, allowing ABOVE_EPS of slack.
        steps = result.get_winners_by_score("step_number", StepNumber)
        ABOVE_EPS = 2.0
        for step in steps:
            sb = step.bbox
            limit = sb.y0 + ABOVE_EPS
            if any(candidate.bbox.y1 <= limit for candidate in parts_lists):
                continue
            messages.append(
                f"Page {page_data.page_number}: step number '{step.value}' "
                f"at {sb} has no parts list above it"
            )
        return messages
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc