• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19751674072

28 Nov 2025 01:40AM UTC coverage: 89.023% (-0.8%) from 89.847%
19751674072

push

github

bramp
refactor(classifier): reorganize text modules and break circular dependencies

Major Changes:
- Created text/ subdirectory for text-related classifier modules
- Moved text_histogram, text_extractors, font_size_hints to text/
- Created constants.py to resolve circular dependency issue

Module Organization:
- classifier/text/__init__.py: Package exports for text modules
- classifier/text/text_histogram.py: TextHistogram class
- classifier/text/text_extractors.py: Text extraction functions
- classifier/text/font_size_hints.py: FontSizeHints class

Circular Dependency Resolution:
- Created classifier/constants.py with CATALOG_ELEMENT_ID_THRESHOLD
- Removed ClassVar from PageHintCollection
- Updated font_size_hints.py and page_hint_collection.py to import from constants
- Fixed package-level circular import by importing TextHistogram directly from module
- Added TODO to consider moving constant to ClassifierConfig

Bug Fixes:
- Fixed DrawableItem frozen model issue in drawing.py
- Created new instances with depth instead of mutating frozen objects

Import Updates:
- Updated all imports across ~15 files to use new module paths
- Updated classifier/__init__.py to re-export text module classes

Tests:
- All tests passing (42/42 test files)
- Type checking passes
- Code formatted with ruff

32 of 32 new or added lines in 23 files covered. (100.0%)

180 existing lines in 19 files now uncovered.

7429 of 8345 relevant lines covered (89.02%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.85
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    NewBagClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.batch_classification_result import (
1✔
28
    BatchClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.block_filter import filter_duplicate_blocks
1✔
31
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
32
    ClassificationResult,
33
)
34
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
35
from build_a_long.pdf_extract.classifier.pages import (
1✔
36
    PageHintCollection,
37
)
38
from build_a_long.pdf_extract.classifier.pages.page_classifier import PageClassifier
1✔
39
from build_a_long.pdf_extract.classifier.pages.page_number_classifier import (
1✔
40
    PageNumberClassifier,
41
)
42
from build_a_long.pdf_extract.classifier.pages.progress_bar_classifier import (
1✔
43
    ProgressBarClassifier,
44
)
45
from build_a_long.pdf_extract.classifier.parts import (
1✔
46
    PartCountClassifier,
47
    PartNumberClassifier,
48
    PartsClassifier,
49
    PartsImageClassifier,
50
    PartsListClassifier,
51
    PieceLengthClassifier,
52
)
53
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
54
from build_a_long.pdf_extract.classifier.steps import (
1✔
55
    DiagramClassifier,
56
    StepClassifier,
57
    StepNumberClassifier,
58
)
59
from build_a_long.pdf_extract.classifier.text import FontSizeHints, TextHistogram
1✔
60
from build_a_long.pdf_extract.classifier.topological_sort import topological_sort
1✔
61
from build_a_long.pdf_extract.extractor import PageData
1✔
62
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
63
    PageNumber,
64
    PartCount,
65
    PartsList,
66
    StepNumber,
67
)
68
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
69

70
logger = logging.getLogger(__name__)
1✔
71

72

73
def classify_elements(page: PageData) -> ClassificationResult:
    """Classify the elements of one page with a default-constructed config.

    Convenience wrapper: builds a ``ClassifierConfig`` with no arguments,
    wraps it in a ``Classifier`` and runs that classifier on ``page``.

    Args:
        page: The single PageData object to classify.

    Returns:
        The ClassificationResult produced by ``Classifier.classify``.
    """
    return Classifier(ClassifierConfig()).classify(page)
86

87

88
def classify_pages(
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:

    1. Filtering phase: run ``filter_duplicate_blocks`` on each page to
       classify and record the mapping of removed block -> kept block.
    2. Analysis phase: build font size hints and page hints from the hint
       pages, and a text histogram from the pages to classify (duplicate
       blocks excluded in both cases).
    3. Classification phase: run one ``Classifier``, configured with those
       hints, on every de-duplicated page.

    Args:
        pages: A list of PageData objects to classify.
        pages_for_hints: Optional list of pages to use for generating font/page
            hints. If None, uses `pages`. This allows generating hints from all
            pages while only classifying a subset (e.g., when using --pages
            filter).

    Returns:
        BatchClassificationResult containing one result per entry of ``pages``
        (in the same order) and the global histogram. Each result's
        ``page_data`` is the original page with all of its blocks; the
        duplicates are marked as removed with reason ``"duplicate_bbox"``.
    """
    # Use all pages for hint generation if provided, otherwise use selected pages
    hint_pages = pages_for_hints if pages_for_hints is not None else pages

    # Phase 1: Filter duplicate blocks on each page and track removals.
    # The kept blocks are remembered per page *object* so that Phase 2 does not
    # have to filter the very same page a second time.
    duplicate_removals: list[dict[Blocks, Blocks]] = []
    kept_blocks_by_page_id = {}
    for page_data in pages:
        kept_blocks, removed_mapping = filter_duplicate_blocks(page_data.blocks)

        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is actually enabled.
        logger.debug(
            "Page %s: filtered %d duplicate blocks",
            page_data.page_number,
            len(removed_mapping),
        )

        duplicate_removals.append(removed_mapping)
        # id() is a safe key here: every page is kept alive by `pages` for the
        # whole call, so two equal ids always mean the same object.
        kept_blocks_by_page_id[id(page_data)] = kept_blocks

    # Phase 2: Extract font size hints from hint pages (excluding removed blocks)
    # Hint pages may differ from the pages to classify; only pages that were
    # not already filtered in Phase 1 are filtered here.
    hint_pages_without_duplicates = []
    for page_data in hint_pages:
        kept_blocks = kept_blocks_by_page_id.get(id(page_data))
        if kept_blocks is None:
            kept_blocks, _ = filter_duplicate_blocks(page_data.blocks)
        hint_pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=kept_blocks,
            )
        )

    # Build pages without duplicates for classification. The original block
    # order is preserved by filtering page_data.blocks against the mapping.
    pages_without_duplicates = [
        PageData(
            page_number=page_data.page_number,
            bbox=page_data.bbox,
            blocks=[
                block for block in page_data.blocks if block not in removed_mapping
            ],
        )
        for page_data, removed_mapping in zip(pages, duplicate_removals, strict=True)
    ]

    # Generate hints from hint pages, histogram from pages to classify
    font_size_hints = FontSizeHints.from_pages(hint_pages_without_duplicates)
    page_hints = PageHintCollection.from_pages(hint_pages_without_duplicates)
    histogram = TextHistogram.from_pages(pages_without_duplicates)

    # Phase 3: Classify using the hints (on pages without duplicates)
    config = ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
    classifier = Classifier(config)

    results = []
    for page_data, page_without_duplicates, removed_mapping in zip(
        pages, pages_without_duplicates, duplicate_removals, strict=True
    ):
        # Classify using only non-removed blocks
        result = classifier.classify(page_without_duplicates)

        # Point the result back at the original page_data (with all blocks)
        result.page_data = page_data

        # Mark duplicate blocks as removed, remembering which block replaced them
        for removed_block, kept_block in removed_mapping.items():
            result.mark_removed(
                removed_block,
                RemovalReason(reason_type="duplicate_bbox", target_block=kept_block),
            )

        results.append(result)

    return BatchClassificationResult(results=results, histogram=histogram)
184

185

186
type Classifiers = (
1✔
187
    PageNumberClassifier
188
    | ProgressBarClassifier
189
    | BagNumberClassifier
190
    | PartCountClassifier
191
    | PartNumberClassifier
192
    | StepNumberClassifier
193
    | PieceLengthClassifier
194
    | PartsClassifier
195
    | PartsListClassifier
196
    | PartsImageClassifier
197
    | NewBagClassifier
198
    | DiagramClassifier
199
    | StepClassifier
200
    | PageClassifier
201
)
202

203

204
class Classifier:
    """
    Performs a single run of classification based on rules, configuration, and hints.
    This class should be stateless.
    """

    # Tolerance added to a step number's bbox.y0 when checking whether a parts
    # list's bbox.y1 lies above it (see _log_post_classification_warnings).
    _ABOVE_EPS = 2.0

    def __init__(self, config: ClassifierConfig):
        """Instantiate every classifier with ``config`` and order them.

        Args:
            config: Configuration handed to each individual classifier.
        """
        self.config = config
        # Sort classifiers topologically based on their dependencies
        self.classifiers = topological_sort(
            [
                PageNumberClassifier(config),
                ProgressBarClassifier(config),
                BagNumberClassifier(config),
                PartCountClassifier(config),
                PartNumberClassifier(config),
                StepNumberClassifier(config),
                PieceLengthClassifier(config),
                PartsClassifier(config),
                PartsListClassifier(config),
                PartsImageClassifier(config),
                NewBagClassifier(config),
                DiagramClassifier(config),
                StepClassifier(config),
                PageClassifier(config),
            ]
        )

    def classify(self, page_data: PageData) -> ClassificationResult:
        """
        Runs the classification logic and returns a result.
        It does NOT modify page_data directly.

        The classification process runs in two phases, followed by a
        sanity-check pass:
        1. Score all classifiers (bottom-up) - auto-registers classifiers
        2. Construct final elements (top-down starting from Page)

        Afterwards the messages produced by
        ``_log_post_classification_warnings`` are attached to the result.

        Args:
            page_data: The page to classify.

        Returns:
            A new ClassificationResult created for ``page_data``.
        """
        result = ClassificationResult(page_data=page_data)

        # Lazy %-style argument: only formatted when DEBUG logging is enabled.
        logger.debug("Starting classification for page %s", page_data.page_number)

        # 1. Score all classifiers (Bottom-Up)
        # Note: score() automatically registers each classifier for its output labels
        for classifier in self.classifiers:
            classifier.score(result)

        # 2. Construct (Top-Down)
        # Find the PageClassifier to start the construction process. __init__
        # always includes one in the list it passes to topological_sort.
        page_classifier = next(
            c for c in self.classifiers if isinstance(c, PageClassifier)
        )
        page_classifier.build_all(result)

        # TODO Do we actually ever add warnings?
        for warning in self._log_post_classification_warnings(page_data, result):
            result.add_warning(warning)

        return result

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Collect human-readable warnings about suspicious classification output.

        Despite its name this method does not log anything itself; it only
        builds the messages and leaves it to the caller to record them.

        Checks performed, in order:
        - the page has at least one "page_number" winner;
        - every "parts_list" winner has at least one "part_count" winner whose
          bbox is fully inside the parts list's bbox;
        - every "step_number" winner has at least one parts list whose
          ``bbox.y1`` is no greater than the step's ``bbox.y0`` plus a small
          tolerance.

        Args:
            page_data: The classified page; only ``page_number`` is read, for
                the message prefix.
            result: The classification result to inspect.

        Returns:
            The warning messages, possibly empty.
        """
        warnings: list[str] = []
        page_no = page_data.page_number

        # Check if there's a page number
        if not result.get_winners_by_score("page_number", PageNumber):
            warnings.append(f"Page {page_no}: missing page number")

        # Get elements by label
        parts_lists = result.get_winners_by_score("parts_list", PartsList)
        part_counts = result.get_winners_by_score("part_count", PartCount)

        for pl in parts_lists:
            has_count = any(pc.bbox.fully_inside(pl.bbox) for pc in part_counts)
            if not has_count:
                warnings.append(
                    f"Page {page_no}: parts list at {pl.bbox} "
                    f"contains no part counts"
                )

        steps = result.get_winners_by_score("step_number", StepNumber)
        for step in steps:
            sb = step.bbox
            # A parts list counts as "above" when its y1 does not exceed the
            # step's y0 by more than the tolerance.
            limit = sb.y0 + self._ABOVE_EPS
            if not any(pl.bbox.y1 <= limit for pl in parts_lists):
                warnings.append(
                    f"Page {page_no}: step number '{step.value}' "
                    f"at {sb} has no parts list above it"
                )
        return warnings
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc