• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19747132032

27 Nov 2025 08:21PM UTC coverage: 89.777% (+0.001%) from 89.776%
19747132032

push

github

bramp
Refactor(classifier): Move bag-related classifiers to new 'bags' directory

Moves `bag_number_classifier.py` and `new_bag_classifier.py` (and their
corresponding test files) into a new subdirectory:
`src/build_a_long/pdf_extract/classifier/bags/`.

This refactoring aims to improve code organization and modularity within
the classifier component. A new `__init__.py` file is created in the
`bags` directory to facilitate easier imports. Associated `BUILD` files
and import statements across the codebase have been updated to reflect
these changes.

6 of 6 new or added lines in 4 files covered. (100.0%)

19 existing lines in 3 files now uncovered.

7324 of 8158 relevant lines covered (89.78%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.91
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    NewBagClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.block_filter import filter_duplicate_blocks
1✔
28
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
29
    BatchClassificationResult,
30
    ClassificationResult,
31
    ClassifierConfig,
32
    RemovalReason,
33
)
34
from build_a_long.pdf_extract.classifier.diagram_classifier import (
1✔
35
    DiagramClassifier,
36
)
37
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
38
from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
1✔
39
from build_a_long.pdf_extract.classifier.page_classifier import PageClassifier
1✔
40
from build_a_long.pdf_extract.classifier.page_hints import PageHints
1✔
41
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
42
    PageNumberClassifier,
43
)
44
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
45
    PartCountClassifier,
46
)
47
from build_a_long.pdf_extract.classifier.part_number_classifier import (
1✔
48
    PartNumberClassifier,
49
)
50
from build_a_long.pdf_extract.classifier.parts_classifier import (
1✔
51
    PartsClassifier,
52
)
53
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
54
    PartsImageClassifier,
55
)
56
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
57
    PartsListClassifier,
58
)
59
from build_a_long.pdf_extract.classifier.piece_length_classifier import (
1✔
60
    PieceLengthClassifier,
61
)
62
from build_a_long.pdf_extract.classifier.progress_bar_classifier import (
1✔
63
    ProgressBarClassifier,
64
)
65
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
66
    StepClassifier,
67
)
68
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
69
    StepNumberClassifier,
70
)
71
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
72
from build_a_long.pdf_extract.extractor import PageData
1✔
73
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
74
    PageNumber,
75
    PartCount,
76
    PartsList,
77
    StepNumber,
78
)
79
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
80

81
logger = logging.getLogger(__name__)
1✔
82

83

84
def classify_elements(page: PageData) -> ClassificationResult:
1✔
85
    """Classify and label elements on a single page using rule-based heuristics.
86

87
    Args:
88
        page: A single PageData object to classify.
89

90
    Returns:
91
        A ClassificationResult object containing the classification results.
92
    """
93
    config = ClassifierConfig()
1✔
94
    classifier = Classifier(config)
1✔
95

96
    return classifier.classify(page)
1✔
97

98

99
def classify_pages(
1✔
100
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
101
) -> BatchClassificationResult:
102
    """Classify and label elements across multiple pages using rule-based heuristics.
103

104
    This function performs a three-phase process:
105
    1. Filtering phase: Mark duplicate/similar blocks as removed on each page
106
    2. Analysis phase: Build font size hints from text properties (excluding
107
       removed blocks)
108
    3. Classification phase: Use hints to guide element classification
109

110
    Args:
111
        pages: A list of PageData objects to classify.
112
        pages_for_hints: Optional list of pages to use for generating font/page hints.
113
            If None, uses `pages`. This allows generating hints from all pages
114
            while only classifying a subset (e.g., when using --pages filter).
115

116
    Returns:
117
        BatchClassificationResult containing per-page results and global histogram
118
    """
119
    # Use all pages for hint generation if provided, otherwise use selected pages
120
    hint_pages = pages_for_hints if pages_for_hints is not None else pages
1✔
121

122
    # Phase 1: Filter duplicate blocks on each page and track removals
123
    duplicate_removals: list[dict[Blocks, Blocks]] = []
1✔
124
    for page_data in pages:
1✔
125
        # Get blocks to keep and mapping of removed blocks
126
        kept_blocks, removed_mapping = filter_duplicate_blocks(page_data.blocks)
1✔
127

128
        logger.debug(
1✔
129
            f"Page {page_data.page_number}: "
130
            f"filtered {len(removed_mapping)} duplicate blocks"
131
        )
132

133
        duplicate_removals.append(removed_mapping)
1✔
134

135
    # Phase 2: Extract font size hints from hint pages (excluding removed blocks)
136
    # Build pages with non-removed blocks for hint extraction and histogram
137

138
    # Filter duplicates from hint pages (may be different from pages to classify)
139
    hint_pages_without_duplicates = []
1✔
140
    for page_data in hint_pages:
1✔
141
        # TODO We are re-filtering duplicates here; optimize by changing the API
142
        # to accept one list of PageData, and separate by page_numbers.
143
        kept_blocks, _ = filter_duplicate_blocks(page_data.blocks)
1✔
144
        hint_pages_without_duplicates.append(
1✔
145
            PageData(
146
                page_number=page_data.page_number,
147
                bbox=page_data.bbox,
148
                blocks=kept_blocks,
149
            )
150
        )
151

152
    # Build pages without duplicates for classification
153
    pages_without_duplicates = []
1✔
154
    for page_data, removed_mapping in zip(pages, duplicate_removals, strict=True):
1✔
155
        non_removed_blocks = [
1✔
156
            block for block in page_data.blocks if block not in removed_mapping
157
        ]
158
        pages_without_duplicates.append(
1✔
159
            PageData(
160
                page_number=page_data.page_number,
161
                bbox=page_data.bbox,
162
                blocks=non_removed_blocks,
163
            )
164
        )
165

166
    # Generate hints from hint pages, histogram from pages to classify
167
    font_size_hints = FontSizeHints.from_pages(hint_pages_without_duplicates)
1✔
168
    page_hints = PageHints.from_pages(hint_pages_without_duplicates)
1✔
169
    histogram = TextHistogram.from_pages(pages_without_duplicates)
1✔
170

171
    # Phase 3: Classify using the hints (on pages without duplicates)
172
    config = ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
1✔
173
    classifier = Classifier(config)
1✔
174

175
    results = []
1✔
176
    for page_data, page_without_duplicates, removed_mapping in zip(
1✔
177
        pages, pages_without_duplicates, duplicate_removals, strict=True
178
    ):
179
        # Classify using only non-removed blocks
180
        result = classifier.classify(page_without_duplicates)
1✔
181

182
        # Update result to use original page_data (with all blocks)
183
        result.page_data = page_data
1✔
184

185
        # Mark duplicate blocks as removed
186
        for removed_block, kept_block in removed_mapping.items():
1✔
187
            result.mark_removed(
1✔
188
                removed_block,
189
                RemovalReason(reason_type="duplicate_bbox", target_block=kept_block),
190
            )
191

192
        results.append(result)
1✔
193

194
    return BatchClassificationResult(results=results, histogram=histogram)
1✔
195

196

197
type Classifiers = (
1✔
198
    PageNumberClassifier
199
    | ProgressBarClassifier
200
    | BagNumberClassifier
201
    | PartCountClassifier
202
    | PartNumberClassifier
203
    | StepNumberClassifier
204
    | PieceLengthClassifier
205
    | PartsClassifier
206
    | PartsListClassifier
207
    | PartsImageClassifier
208
    | NewBagClassifier
209
    | DiagramClassifier
210
    | StepClassifier
211
    | PageClassifier
212
)
213

214

215
class Classifier:
1✔
216
    """
217
    Performs a single run of classification based on rules, configuration, and hints.
218
    This class should be stateless.
219
    """
220

221
    def __init__(self, config: ClassifierConfig):
1✔
222
        self.config = config
1✔
223
        self.classifiers: list[LabelClassifier] = [
1✔
224
            PageNumberClassifier(config),
225
            ProgressBarClassifier(config),
226
            BagNumberClassifier(config),
227
            PartCountClassifier(config),
228
            PartNumberClassifier(config),
229
            StepNumberClassifier(config),
230
            PieceLengthClassifier(config),
231
            PartsClassifier(config),
232
            PartsListClassifier(config),
233
            PartsImageClassifier(config),
234
            NewBagClassifier(config),
235
            DiagramClassifier(config),
236
            StepClassifier(config),
237
            PageClassifier(config),
238
        ]
239

240
        # TODO Topological sort classifiers based on dependencies
241

242
    def classify(self, page_data: PageData) -> ClassificationResult:
1✔
243
        """
244
        Runs the classification logic and returns a result.
245
        It does NOT modify page_data directly.
246

247
        The classification process runs in two phases:
248
        1. Score all classifiers (bottom-up) - auto-registers classifiers
249
        2. Construct final elements (top-down starting from Page)
250
        """
251
        result = ClassificationResult(page_data=page_data)
1✔
252

253
        logger.debug(f"Starting classification for page {page_data.page_number}")
1✔
254

255
        # 1. Score all classifiers (Bottom-Up)
256
        # Note: score() automatically registers each classifier for its output labels
257
        for classifier in self.classifiers:
1✔
258
            classifier.score(result)
1✔
259

260
        # 2. Construct (Top-Down)
261
        # Find the PageClassifier to start the construction process
262
        page_classifier = next(
1✔
263
            c for c in self.classifiers if isinstance(c, PageClassifier)
264
        )
265
        page_classifier.build_all(result)
1✔
266

267
        # TODO Do we actually ever add warnings?
268
        warnings = self._log_post_classification_warnings(page_data, result)
1✔
269
        for warning in warnings:
1✔
270
            result.add_warning(warning)
1✔
271

272
        return result
1✔
273

274
    def _log_post_classification_warnings(
1✔
275
        self, page_data: PageData, result: ClassificationResult
276
    ) -> list[str]:
277
        warnings = []
1✔
278

279
        # Check if there's a page number
280
        page_numbers = result.get_winners_by_score("page_number", PageNumber)
1✔
281
        if not page_numbers:
1✔
282
            warnings.append(f"Page {page_data.page_number}: missing page number")
1✔
283

284
        # Get elements by label
285
        parts_lists = result.get_winners_by_score("parts_list", PartsList)
1✔
286
        part_counts = result.get_winners_by_score("part_count", PartCount)
1✔
287

288
        for pl in parts_lists:
1✔
289
            inside_counts = [t for t in part_counts if t.bbox.fully_inside(pl.bbox)]
1✔
290
            if not inside_counts:
1✔
UNCOV
291
                warnings.append(
×
292
                    f"Page {page_data.page_number}: parts list at {pl.bbox} "
293
                    f"contains no part counts"
294
                )
295

296
        steps = result.get_winners_by_score("step_number", StepNumber)
1✔
297
        ABOVE_EPS = 2.0
1✔
298
        for step in steps:
1✔
299
            sb = step.bbox
1✔
300
            above = [pl for pl in parts_lists if pl.bbox.y1 <= sb.y0 + ABOVE_EPS]
1✔
301
            if not above:
1✔
302
                warnings.append(
1✔
303
                    f"Page {page_data.page_number}: step number '{step.value}' "
304
                    f"at {sb} has no parts list above it"
305
                )
306
        return warnings
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc