• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19724286935

27 Nov 2025 03:27AM UTC coverage: 89.068% (-2.2%) from 91.307%
19724286935

push

github

bramp
test(pdf_extract:classifier): Refactor tests to use fixtures and remove integration test

This commit refactors the classifier test suite to improve maintainability and isolation.

Key changes:
- **Fixtures:** Introduced `conftest.py` with `classifier` and `candidate_factory` fixtures to streamline test setup and candidate creation.
- **Unit Test Conversion:** Updated `parts_classifier_test.py`, `step_classifier_test.py`, `page_number_classifier_test.py`, `part_count_classifier_test.py`, and `step_number_classifier_test.py` to use these fixtures, removing dependency on `classify_elements` and making them true unit tests.
- **Integration Test Removal:** Deleted `test_font_size_integration.py` (and its temporary rename `font_size_scoring_test.py`) as its logic has been moved into the respective classifier unit tests.
- **Cleanup:** Removed debug prints and ensured strict type checking compliance.

382 of 389 new or added lines in 9 files covered. (98.2%)

291 existing lines in 28 files now uncovered.

7585 of 8516 relevant lines covered (89.07%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.96
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Conflict Resolution**: Global conflict resolution logic runs to identify
13
   cases where a single element is claimed by multiple candidates (e.g. a number
14
   could be a step number or a piece length). Candidates are prioritized and
15
   filtered.
16

17
3. **Top-down Construction**: The root `PageClassifier` is invoked to construct
18
   the final `Page` object. It recursively requests the construction of its
19
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
20
   their own dependencies. This ensures a consistent and validated object tree.
21

22
"""
23

24
from __future__ import annotations
1✔
25

26
import logging
1✔
27

28
from build_a_long.pdf_extract.classifier.bag_number_classifier import (
1✔
29
    BagNumberClassifier,
30
)
31
from build_a_long.pdf_extract.classifier.block_filter import filter_duplicate_blocks
1✔
32
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
33
    BatchClassificationResult,
34
    ClassificationResult,
35
    ClassifierConfig,
36
    RemovalReason,
37
)
38
from build_a_long.pdf_extract.classifier.diagram_classifier import (
1✔
39
    DiagramClassifier,
40
)
41
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
42
from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
1✔
43
from build_a_long.pdf_extract.classifier.new_bag_classifier import (
1✔
44
    NewBagClassifier,
45
)
46
from build_a_long.pdf_extract.classifier.page_classifier import PageClassifier
1✔
47
from build_a_long.pdf_extract.classifier.page_hints import PageHints
1✔
48
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
49
    PageNumberClassifier,
50
)
51
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
52
    PartCountClassifier,
53
)
54
from build_a_long.pdf_extract.classifier.part_number_classifier import (
1✔
55
    PartNumberClassifier,
56
)
57
from build_a_long.pdf_extract.classifier.parts_classifier import (
1✔
58
    PartsClassifier,
59
)
60
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
61
    PartsImageClassifier,
62
)
63
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
64
    PartsListClassifier,
65
)
66
from build_a_long.pdf_extract.classifier.piece_length_classifier import (
1✔
67
    PieceLengthClassifier,
68
)
69
from build_a_long.pdf_extract.classifier.progress_bar_classifier import (
1✔
70
    ProgressBarClassifier,
71
)
72
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
73
    StepClassifier,
74
)
75
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
76
    StepNumberClassifier,
77
)
78
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
79
from build_a_long.pdf_extract.extractor import PageData
1✔
80
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
81
    PageNumber,
82
    PartCount,
83
    PartsList,
84
    StepNumber,
85
)
86
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
87

88
logger = logging.getLogger(__name__)
1✔
89

90

91
def classify_elements(page: PageData) -> ClassificationResult:
1✔
92
    """Classify and label elements on a single page using rule-based heuristics.
93

94
    Args:
95
        page: A single PageData object to classify.
96

97
    Returns:
98
        A ClassificationResult object containing the classification results.
99
    """
100
    config = ClassifierConfig()
1✔
101
    classifier = Classifier(config)
1✔
102

103
    return classifier.classify(page)
1✔
104

105

106
def classify_pages(
1✔
107
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
108
) -> BatchClassificationResult:
109
    """Classify and label elements across multiple pages using rule-based heuristics.
110

111
    This function performs a three-phase process:
112
    1. Filtering phase: Mark duplicate/similar blocks as removed on each page
113
    2. Analysis phase: Build font size hints from text properties (excluding
114
       removed blocks)
115
    3. Classification phase: Use hints to guide element classification
116

117
    Args:
118
        pages: A list of PageData objects to classify.
119
        pages_for_hints: Optional list of pages to use for generating font/page hints.
120
            If None, uses `pages`. This allows generating hints from all pages
121
            while only classifying a subset (e.g., when using --pages filter).
122

123
    Returns:
124
        BatchClassificationResult containing per-page results and global histogram
125
    """
126
    # Use all pages for hint generation if provided, otherwise use selected pages
127
    hint_pages = pages_for_hints if pages_for_hints is not None else pages
1✔
128

129
    # Phase 1: Filter duplicate blocks on each page and track removals
130
    duplicate_removals: list[dict[Blocks, Blocks]] = []
1✔
131
    for page_data in pages:
1✔
132
        # Get blocks to keep and mapping of removed blocks
133
        kept_blocks, removed_mapping = filter_duplicate_blocks(page_data.blocks)
1✔
134

135
        logger.debug(
1✔
136
            f"Page {page_data.page_number}: "
137
            f"filtered {len(removed_mapping)} duplicate blocks"
138
        )
139

140
        duplicate_removals.append(removed_mapping)
1✔
141

142
    # Phase 2: Extract font size hints from hint pages (excluding removed blocks)
143
    # Build pages with non-removed blocks for hint extraction and histogram
144

145
    # Filter duplicates from hint pages (may be different from pages to classify)
146
    hint_pages_without_duplicates = []
1✔
147
    for page_data in hint_pages:
1✔
148
        # TODO We are re-filtering duplicates here; optimize by changing the API
149
        # to accept one list of PageData, and seperate by page_numbers.
150
        kept_blocks, _ = filter_duplicate_blocks(page_data.blocks)
1✔
151
        hint_pages_without_duplicates.append(
1✔
152
            PageData(
153
                page_number=page_data.page_number,
154
                bbox=page_data.bbox,
155
                blocks=kept_blocks,
156
            )
157
        )
158

159
    # Build pages without duplicates for classification
160
    pages_without_duplicates = []
1✔
161
    for page_data, removed_mapping in zip(pages, duplicate_removals, strict=True):
1✔
162
        non_removed_blocks = [
1✔
163
            block for block in page_data.blocks if block not in removed_mapping
164
        ]
165
        pages_without_duplicates.append(
1✔
166
            PageData(
167
                page_number=page_data.page_number,
168
                bbox=page_data.bbox,
169
                blocks=non_removed_blocks,
170
            )
171
        )
172

173
    # Generate hints from hint pages, histogram from pages to classify
174
    font_size_hints = FontSizeHints.from_pages(hint_pages_without_duplicates)
1✔
175
    page_hints = PageHints.from_pages(hint_pages_without_duplicates)
1✔
176
    histogram = TextHistogram.from_pages(pages_without_duplicates)
1✔
177

178
    # Phase 3: Classify using the hints (on pages without duplicates)
179
    config = ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
1✔
180
    classifier = Classifier(config)
1✔
181

182
    results = []
1✔
183
    for page_data, page_without_duplicates, removed_mapping in zip(
1✔
184
        pages, pages_without_duplicates, duplicate_removals, strict=True
185
    ):
186
        # Classify using only non-removed blocks
187
        result = classifier.classify(page_without_duplicates)
1✔
188

189
        # Update result to use original page_data (with all blocks)
190
        result.page_data = page_data
1✔
191

192
        # Mark duplicate blocks as removed
193
        for removed_block, kept_block in removed_mapping.items():
1✔
194
            result.mark_removed(
1✔
195
                removed_block,
196
                RemovalReason(reason_type="duplicate_bbox", target_block=kept_block),
197
            )
198

199
        results.append(result)
1✔
200

201
    return BatchClassificationResult(results=results, histogram=histogram)
1✔
202

203

204
type Classifiers = (
1✔
205
    PageNumberClassifier
206
    | ProgressBarClassifier
207
    | BagNumberClassifier
208
    | PartCountClassifier
209
    | PartNumberClassifier
210
    | StepNumberClassifier
211
    | PieceLengthClassifier
212
    | PartsClassifier
213
    | PartsListClassifier
214
    | PartsImageClassifier
215
    | NewBagClassifier
216
    | DiagramClassifier
217
    | StepClassifier
218
    | PageClassifier
219
)
220

221

222
class Classifier:
1✔
223
    """
224
    Performs a single run of classification based on rules, configuration, and hints.
225
    This class should be stateless.
226
    """
227

228
    def __init__(self, config: ClassifierConfig):
1✔
229
        self.config = config
1✔
230
        self.classifiers: list[LabelClassifier] = [
1✔
231
            PageNumberClassifier(config),
232
            ProgressBarClassifier(config),
233
            BagNumberClassifier(config),
234
            PartCountClassifier(config),
235
            PartNumberClassifier(config),
236
            StepNumberClassifier(config),
237
            PieceLengthClassifier(config),
238
            PartsClassifier(config),
239
            PartsListClassifier(config),
240
            PartsImageClassifier(config),
241
            NewBagClassifier(config),
242
            DiagramClassifier(config),
243
            StepClassifier(config),
244
            PageClassifier(config),
245
        ]
246

247
    def classify(self, page_data: PageData) -> ClassificationResult:
1✔
248
        """
249
        Runs the classification logic and returns a result.
250
        It does NOT modify page_data directly.
251

252
        The classification process runs in three phases:
253
        1. Score all classifiers (bottom-up)
254
        2. Resolve conflicts (global)
255
        3. Construct final elements (top-down starting from Page)
256
        """
257
        result = ClassificationResult(page_data=page_data)
1✔
258

259
        logger.debug(f"Starting classification for page {page_data.page_number}")
1✔
260

261
        # 0. Register classifiers
262
        for classifier in self.classifiers:
1✔
263
            for label in classifier.outputs:
1✔
264
                result.register_classifier(label, classifier)
1✔
265

266
        # 1. Score all classifiers (Bottom-Up)
267
        for classifier in self.classifiers:
1✔
268
            classifier.score(result)
1✔
269

270
        # 2. Resolve conflicts
271
        # resolve_label_conflicts(result)
272

273
        # 3. Construct (Top-Down)
274
        # Find the PageClassifier to start the construction process
275
        page_classifier = next(
1✔
276
            c for c in self.classifiers if isinstance(c, PageClassifier)
277
        )
278
        page_classifier.construct(result)
1✔
279

280
        warnings = self._log_post_classification_warnings(page_data, result)
1✔
281
        for warning in warnings:
1✔
282
            result.add_warning(warning)
1✔
283

284
        return result
1✔
285

286
    def _log_post_classification_warnings(
1✔
287
        self, page_data: PageData, result: ClassificationResult
288
    ) -> list[str]:
289
        warnings = []
1✔
290

291
        # Check if there's a page number
292
        page_numbers = result.get_winners_by_score("page_number", PageNumber)
1✔
293
        if not page_numbers:
1✔
294
            warnings.append(f"Page {page_data.page_number}: missing page number")
1✔
295

296
        # Get elements by label
297
        parts_lists = result.get_winners_by_score("parts_list", PartsList)
1✔
298
        part_counts = result.get_winners_by_score("part_count", PartCount)
1✔
299

300
        for pl in parts_lists:
1✔
301
            inside_counts = [t for t in part_counts if t.bbox.fully_inside(pl.bbox)]
1✔
302
            if not inside_counts:
1✔
UNCOV
303
                warnings.append(
×
304
                    f"Page {page_data.page_number}: parts list at {pl.bbox} "
305
                    f"contains no part counts"
306
                )
307

308
        steps = result.get_winners_by_score("step_number", StepNumber)
1✔
309
        ABOVE_EPS = 2.0
1✔
310
        for step in steps:
1✔
311
            sb = step.bbox
1✔
312
            above = [pl for pl in parts_lists if pl.bbox.y1 <= sb.y0 + ABOVE_EPS]
1✔
313
            if not above:
1✔
314
                warnings.append(
1✔
315
                    f"Page {page_data.page_number}: step number '{step.value}' "
316
                    f"at {sb} has no parts list above it"
317
                )
318
        return warnings
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc