• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19521244091

15 Nov 2025 02:10AM UTC coverage: 90.833% (-0.4%) from 91.217%
19521244091

push

github

bramp
refactor: remove unused code and simplify domain invariant tests

Removed approximately 220 lines of unused/redundant code from classifier tests:

classifier_rules_test.py:
- Removed ClassifiedPage wrapper class (~110 lines) - never instantiated
- Removed helper functions (_parts_lists, _part_images, _part_counts,
  _print_label_counts) - never called
- Cleaned up unused imports (defaultdict, Block, ClassificationResult)
- Updated docstring to reflect remaining test coverage

domain_invariants_test.py:
- Simplified all 4 tests to use result.page property directly
- Replaced verbose 6-line get_candidates() pattern with simple property access
- Removed redundant isinstance(page, Page) assertions (~48 lines total)
- Tests now more clearly express intent: validate Page/PartsList/Part objects

All tests continue to pass. No functionality was lost.

4 of 4 new or added lines in 2 files covered. (100.0%)

151 existing lines in 7 files now uncovered.

4994 of 5498 relevant lines covered (90.83%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.84
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
Classifiers run in a fixed, enforced order because later stages depend on
7
labels produced by earlier stages:
8

9
1) PageNumberClassifier  → outputs: "page_number"
10
2) ProgressBarClassifier → outputs: "progress_bar" (optional, near page_number)
11
3) PartCountClassifier   → outputs: "part_count"
12
4) PartNumberClassifier  → outputs: "part_number" (catalog pages)
13
5) StepNumberClassifier  → outputs: "step_number" (uses page_number size)
14
6) PartsClassifier       → outputs: "part" (requires part_count, pairs with images)
15
7) PartsListClassifier   → outputs: "parts_list" (requires part)
16
8) PartsImageClassifier  → outputs: "part_image" (requires parts_list, part_count)
17
9) StepClassifier        → outputs: "step" (requires step_number and parts_list)
18
10) PageClassifier       → outputs: "page" (requires page_number and step)
19

20
If the order is changed such that a classifier runs before its requirements
21
are available, a ValueError will be raised at initialization time.
22
"""
23

24
from __future__ import annotations
1✔
25

26
import logging
1✔
27

28
from build_a_long.pdf_extract.classifier.block_filter import filter_duplicate_blocks
1✔
29
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
30
    BatchClassificationResult,
31
    ClassificationResult,
32
    ClassifierConfig,
33
    RemovalReason,
34
)
35
from build_a_long.pdf_extract.classifier.font_size_hints import FontSizeHints
1✔
36
from build_a_long.pdf_extract.classifier.page_classifier import PageClassifier
1✔
37
from build_a_long.pdf_extract.classifier.page_number_classifier import (
1✔
38
    PageNumberClassifier,
39
)
40
from build_a_long.pdf_extract.classifier.part_count_classifier import (
1✔
41
    PartCountClassifier,
42
)
43
from build_a_long.pdf_extract.classifier.part_number_classifier import (
1✔
44
    PartNumberClassifier,
45
)
46
from build_a_long.pdf_extract.classifier.parts_classifier import (
1✔
47
    PartsClassifier,
48
)
49
from build_a_long.pdf_extract.classifier.parts_image_classifier import (
1✔
50
    PartsImageClassifier,
51
)
52
from build_a_long.pdf_extract.classifier.parts_list_classifier import (
1✔
53
    PartsListClassifier,
54
)
55
from build_a_long.pdf_extract.classifier.progress_bar_classifier import (
1✔
56
    ProgressBarClassifier,
57
)
58
from build_a_long.pdf_extract.classifier.step_classifier import (
1✔
59
    StepClassifier,
60
)
61
from build_a_long.pdf_extract.classifier.step_number_classifier import (
1✔
62
    StepNumberClassifier,
63
)
64
from build_a_long.pdf_extract.classifier.text_histogram import TextHistogram
1✔
65
from build_a_long.pdf_extract.extractor import PageData
1✔
66
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text
1✔
67

68
logger = logging.getLogger(__name__)
1✔
69

70

71
def classify_elements(page: PageData) -> ClassificationResult:
    """Label the elements of one page with the rule-based classifier.

    A ClassifierConfig constructed with no arguments is used, so this is the
    entry point for classifying a page in isolation.

    Args:
        page: The single PageData object whose elements should be classified.

    Returns:
        The ClassificationResult produced by a freshly built Classifier.
    """
    return Classifier(ClassifierConfig()).classify(page)
1✔
84

85

86
def classify_pages(pages: list[PageData]) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:
    1. Filtering phase: find duplicate/similar blocks on each page
    2. Analysis phase: build font size hints and a text histogram from the
       blocks that were not flagged as duplicates
    3. Classification phase: classify each de-duplicated page using the hints,
       then record the duplicates as removed on the per-page result

    Args:
        pages: A list of PageData objects to classify.

    Returns:
        BatchClassificationResult containing per-page results and global histogram
    """
    # Phase 1: Find duplicate blocks on each page. Each mapping goes from a
    # removed block to the block that was kept in its place.
    duplicate_removals: list[dict[Block, Block]] = []
    for page_data in pages:
        # Only the removed -> kept mapping is needed; the surviving blocks are
        # re-derived in phase 2 by membership in this mapping.
        _, removed_mapping = filter_duplicate_blocks(page_data.blocks)

        # Lazy %-formatting: the message is only built if DEBUG is enabled.
        logger.debug(
            "Page %s: filtered %d duplicate blocks",
            page_data.page_number,
            len(removed_mapping),
        )

        duplicate_removals.append(removed_mapping)

    # Phase 2: Build copies of the pages that contain only non-removed blocks,
    # then derive the font size hints and histogram from those copies.
    pages_without_duplicates = [
        PageData(
            page_number=page_data.page_number,
            bbox=page_data.bbox,
            blocks=[
                block for block in page_data.blocks if block not in removed_mapping
            ],
        )
        for page_data, removed_mapping in zip(pages, duplicate_removals, strict=True)
    ]

    font_size_hints = FontSizeHints.from_pages(pages_without_duplicates)
    histogram = TextHistogram.from_pages(pages_without_duplicates)

    # Phase 3: Classify using the hints (on pages without duplicates)
    classifier = Classifier(ClassifierConfig(font_size_hints=font_size_hints))

    results: list[ClassificationResult] = []
    for page_data, page_without_duplicates, removed_mapping in zip(
        pages, pages_without_duplicates, duplicate_removals, strict=True
    ):
        # Classify using only non-removed blocks
        result = classifier.classify(page_without_duplicates)

        # Point the result back at the original page_data (with all blocks)
        result.page_data = page_data

        # Record every duplicate as removed, referencing the block kept instead
        for removed_block, kept_block in removed_mapping.items():
            result.mark_removed(
                removed_block,
                RemovalReason(reason_type="duplicate_bbox", target_block=kept_block),
            )

        results.append(result)

    return BatchClassificationResult(results=results, histogram=histogram)
1✔
156

157

158
# Union of the ten classifier classes instantiated in Classifier.__init__;
# used there as the element type of the ``classifiers`` list.
type Classifiers = (
    PageNumberClassifier
    | ProgressBarClassifier
    | PartCountClassifier
    | PartNumberClassifier
    | StepNumberClassifier
    | PartsClassifier
    | PartsListClassifier
    | PartsImageClassifier
    | StepClassifier
    | PageClassifier
)
170

171

172
class Classifier:
    """
    Runs one rule-based classification pass driven by a ClassifierConfig.

    The pipeline of sub-classifiers is built and order-checked once in
    ``__init__``. This class should be stateless.
    """

    def __init__(self, config: ClassifierConfig):
        """Build the classifier pipeline and validate its ordering.

        Args:
            config: Configuration handed to every sub-classifier.

        Raises:
            ValueError: If a classifier's ``requires`` labels are not all
                covered by the ``outputs`` of the classifiers placed before it.
        """
        self.config = config
        self.classifiers: list[Classifiers] = [
            PageNumberClassifier(config),
            ProgressBarClassifier(config),
            PartCountClassifier(config),
            PartNumberClassifier(config),
            StepNumberClassifier(config),
            PartsClassifier(config),
            PartsListClassifier(config),
            PartsImageClassifier(config),
            StepClassifier(config),
            PageClassifier(config),
        ]

        # TODO Create a directed graph, and run it in order.
        available: set[str] = set()
        for stage in self.classifiers:
            # Classifiers without a ``requires`` attribute need nothing.
            required = getattr(stage, "requires", set())
            if required.issubset(available):
                # Requirements met; its outputs become available downstream.
                available |= getattr(stage, "outputs", set())
                continue

            unmet = ", ".join(sorted(required - available))
            raise ValueError(
                f"Classifier order invalid: {stage.__class__.__name__} requires "
                f"labels not yet produced: {unmet}"
            )

    def classify(self, page_data: PageData) -> ClassificationResult:
        """
        Run every sub-classifier over the page and return the result.

        Each classifier's ``evaluate`` and then ``classify`` is called on a
        new ClassificationResult, in pipeline order. Post-classification
        warnings are then attached to that result.
        It does NOT modify page_data directly.
        """
        outcome = ClassificationResult(page_data=page_data)

        for stage in self.classifiers:
            stage.evaluate(outcome)
            stage.classify(outcome)

        for message in self._log_post_classification_warnings(page_data, outcome):
            outcome.add_warning(message)

        return outcome

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Collect sanity-check warnings about a classified page.

        Checks performed:
        - the page has a "page_number" label;
        - every "parts_list" block has at least one "part_count" block fully
          inside its bbox;
        - every Text "step_number" block has a parts list whose bbox.y1 is at
          most ABOVE_EPS past the step's bbox.y0.

        Returns:
            The warning messages, in the order the checks are listed above.
        """
        messages = []

        # Check if there's a page number
        if not result.has_label("page_number"):
            messages.append(f"Page {page_data.page_number}: missing page number")

        # Get elements by label
        parts_lists = result.get_blocks_by_label("parts_list")
        part_counts = result.get_blocks_by_label("part_count")

        for parts_list in parts_lists:
            if any(
                count.bbox.fully_inside(parts_list.bbox) for count in part_counts
            ):
                continue
            messages.append(
                f"Page {page_data.page_number}: parts list at {parts_list.bbox} "
                f"contains no part counts"
            )

        # Only Text blocks carry the ``text`` used in the warning message.
        step_numbers: list[Text] = [
            block
            for block in result.get_blocks_by_label("step_number")
            if isinstance(block, Text)
        ]
        # Tolerance added to the step's y0 when testing "above".
        ABOVE_EPS = 2.0
        for step in step_numbers:
            step_box = step.bbox
            has_list_above = any(
                parts_list.bbox.y1 <= step_box.y0 + ABOVE_EPS
                for parts_list in parts_lists
            )
            if not has_list_above:
                messages.append(
                    f"Page {page_data.page_number}: step number '{step.text}' "
                    f"at {step_box} has no parts list above it"
                )
        return messages
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc