• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20086551557

10 Dec 2025 03:43AM UTC coverage: 90.303% (+0.3%) from 90.041%
20086551557

push

github

bramp
Refactor arrow shaft detection: unified method, stroked line support, multi-head grouping

- Merge _find_simple_shaft, _find_stroked_line_shaft, and _find_cornered_shaft
  into a single unified _find_shaft method that handles all shaft types by
  extracting points and finding closest/furthest from the arrowhead tip
- Add support for stroked line shafts (stroke_color instead of fill_color)
- Add tail_grouping_tolerance config for grouping arrowheads with nearby tails
- Group arrowheads that share the same shaft_block (L-shaped arrows with
  multiple heads at different ends)
- Use union-find algorithm to group arrowheads by shared shaft or tail proximity
- Extract colors_match to shared utils module
- Add comprehensive tests for stroked line shafts, tail correctness, and
  multi-head arrow grouping
- Update golden files for pages 011, 013, 015, 017 with corrected arrow detection

204 of 206 new or added lines in 5 files covered. (99.03%)

252 existing lines in 14 files now uncovered.

11855 of 13128 relevant lines covered (90.3%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.11
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    NewBagClassifier,
26
)
27
from build_a_long.pdf_extract.classifier.batch_classification_result import (
1✔
28
    BatchClassificationResult,
29
)
30
from build_a_long.pdf_extract.classifier.block_filter import (
1✔
31
    filter_duplicate_blocks,
32
    filter_overlapping_text_blocks,
33
)
34
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
35
    ClassificationResult,
36
)
37
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
38
from build_a_long.pdf_extract.classifier.pages import (
1✔
39
    PageHintCollection,
40
)
41
from build_a_long.pdf_extract.classifier.pages.background_classifier import (
1✔
42
    BackgroundClassifier,
43
)
44
from build_a_long.pdf_extract.classifier.pages.divider_classifier import (
1✔
45
    DividerClassifier,
46
)
47
from build_a_long.pdf_extract.classifier.pages.page_classifier import PageClassifier
1✔
48
from build_a_long.pdf_extract.classifier.pages.page_number_classifier import (
1✔
49
    PageNumberClassifier,
50
)
51
from build_a_long.pdf_extract.classifier.pages.preview_classifier import (
1✔
52
    PreviewClassifier,
53
)
54
from build_a_long.pdf_extract.classifier.pages.progress_bar_classifier import (
1✔
55
    ProgressBarClassifier,
56
)
57
from build_a_long.pdf_extract.classifier.pages.progress_bar_indicator_classifier import (
1✔
58
    ProgressBarIndicatorClassifier,
59
)
60
from build_a_long.pdf_extract.classifier.pages.trivia_text_classifier import (
1✔
61
    TriviaTextClassifier,
62
)
63
from build_a_long.pdf_extract.classifier.parts import (
1✔
64
    PartCountClassifier,
65
    PartNumberClassifier,
66
    PartsClassifier,
67
    PartsImageClassifier,
68
    PartsListClassifier,
69
    PieceLengthClassifier,
70
    ShineClassifier,
71
)
72
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
73
from build_a_long.pdf_extract.classifier.steps import (
1✔
74
    ArrowClassifier,
75
    DiagramClassifier,
76
    RotationSymbolClassifier,
77
    StepClassifier,
78
    StepCountClassifier,
79
    StepNumberClassifier,
80
    SubAssemblyClassifier,
81
)
82
from build_a_long.pdf_extract.classifier.text import FontSizeHints, TextHistogram
1✔
83
from build_a_long.pdf_extract.classifier.topological_sort import topological_sort
1✔
84
from build_a_long.pdf_extract.extractor import PageData
1✔
85
from build_a_long.pdf_extract.extractor.bbox import filter_contained
1✔
86
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
87
    PageNumber,
88
    PartCount,
89
    PartsList,
90
    StepNumber,
91
)
92
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
93

94
logger = logging.getLogger(__name__)
1✔
95

96
# Pages with more blocks than this threshold will be skipped during classification.
97
# This avoids O(n²) algorithms (like duplicate detection) that become prohibitively
98
# slow on pages with thousands of vector drawings. Such pages are typically info
99
# pages where each character is a separate vector graphic.
100
# TODO: Add spatial indexing to handle high-block pages efficiently.
101
MAX_BLOCKS_PER_PAGE = 1000  # compared with `>`: a page with exactly 1000 blocks is still classified
1✔
102

103

104
# TODO require config, so we don't accidentally use default empty config
105
def classify_elements(
    page: PageData, config: ClassifierConfig | None = None
) -> ClassificationResult:
    """Run the rule-based classifier over one page and return its result.

    Args:
        page: The single PageData object to classify.
        config: Classifier configuration to use. When omitted (None), a
            default-constructed ``ClassifierConfig()`` is used instead, i.e.
            one without any hints supplied by the caller. Passing a config
            carrying FontSizeHints computed from several pages of the same
            PDF is recommended for better accuracy.

    Returns:
        The ClassificationResult produced by ``Classifier.classify`` for
        ``page``.
    """
    # Fall back to a default configuration only when the caller gave none.
    effective_config = ClassifierConfig() if config is None else config
    return Classifier(effective_config).classify(page)
125

126

127
def classify_pages(
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
) -> BatchClassificationResult:
    """Classify and label elements across multiple pages using rule-based heuristics.

    This function performs a three-phase process:
    1. Filtering phase: Mark duplicate/similar blocks as removed on each page
    2. Analysis phase: Build font size hints from text properties (excluding
       removed blocks)
    3. Classification phase: Use hints to guide element classification

    Pages whose block count exceeds ``MAX_BLOCKS_PER_PAGE`` are neither
    filtered nor classified; they get a ``ClassificationResult`` carrying a
    ``skipped_reason`` instead, and they are left out of hint generation.

    Args:
        pages: A list of PageData objects to classify.
        pages_for_hints: Optional list of pages to use for generating font/page hints.
            If None, uses `pages`. This allows generating hints from all pages
            while only classifying a subset (e.g., when using --pages filter).

    Returns:
        BatchClassificationResult containing per-page results (one per entry
        of ``pages``, in the same order) and the global histogram.
    """

    # Use all pages for hint generation if provided, otherwise use selected pages
    hint_pages = pages_for_hints if pages_for_hints is not None else pages

    # Phase 1: Filter duplicate blocks on each page and track removals.
    # Skip pages with too many blocks to avoid O(n²) performance issues.
    removed_blocks_per_page: list[dict[Blocks, RemovalReason]] = []
    # Skip decisions are tracked per position (parallel to `pages`) rather than
    # by page number, so two entries sharing a page number cannot be confused.
    skipped_flags: list[bool] = []
    # Kept blocks from phase 1, keyed by id() of the PageData object, so hint
    # generation can reuse them instead of re-running the filters. The ids are
    # stable here because `pages` keeps every object alive for the whole call.
    kept_blocks_by_page_id: dict[int, list[Blocks]] = {}

    for page_data in pages:
        # Skip pages with too many blocks - these are likely info/inventory pages
        # with vectorized text that cause O(n²) algorithms to be very slow
        too_many_blocks = len(page_data.blocks) > MAX_BLOCKS_PER_PAGE
        skipped_flags.append(too_many_blocks)
        if too_many_blocks:
            logger.debug(
                f"Page {page_data.page_number}: skipping classification "
                f"({len(page_data.blocks)} blocks exceeds threshold of "
                f"{MAX_BLOCKS_PER_PAGE})"
            )
            removed_blocks_per_page.append({})
            continue

        # Filter overlapping text blocks (e.g., "4" and "43" at same origin)
        kept_blocks, text_removed = filter_overlapping_text_blocks(page_data.blocks)

        # Filter duplicate image/drawing blocks based on IOU
        kept_blocks, bbox_removed = filter_duplicate_blocks(kept_blocks)

        kept_blocks_by_page_id[id(page_data)] = kept_blocks

        logger.debug(
            f"Page {page_data.page_number}: "
            f"filtered {len(text_removed)} overlapping text, "
            f"{len(bbox_removed)} duplicate bbox blocks"
        )

        # Combine all removal mappings into a single dict for this page
        removed_blocks_per_page.append({**text_removed, **bbox_removed})

    # Phase 2: Extract font size hints from hint pages (excluding removed blocks)
    # Build pages with non-removed blocks for hint extraction and histogram

    # Filter duplicates from hint pages (may be different from pages to classify)
    hint_pages_without_duplicates = []
    for page_data in hint_pages:
        # Skip high-block pages for hints too (same threshold)
        if len(page_data.blocks) > MAX_BLOCKS_PER_PAGE:
            continue

        # Reuse the phase-1 result when this exact PageData object was already
        # filtered; only hint-only pages need the filters run here.
        kept_blocks = kept_blocks_by_page_id.get(id(page_data))
        if kept_blocks is None:
            kept_blocks, _ = filter_overlapping_text_blocks(page_data.blocks)
            kept_blocks, _ = filter_duplicate_blocks(kept_blocks)

        hint_pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=kept_blocks,
            )
        )

    # Build pages without duplicates for classification
    pages_without_duplicates = []
    for page_data, removed_mapping in zip(pages, removed_blocks_per_page, strict=True):
        # We need to filter blocks that were removed by ANY filter
        non_removed_blocks = [
            block for block in page_data.blocks if block not in removed_mapping
        ]
        pages_without_duplicates.append(
            PageData(
                page_number=page_data.page_number,
                bbox=page_data.bbox,
                blocks=non_removed_blocks,
            )
        )

    # Generate hints from hint pages, histogram from pages to classify
    font_size_hints = FontSizeHints.from_pages(hint_pages_without_duplicates)
    page_hints = PageHintCollection.from_pages(hint_pages_without_duplicates)
    histogram = TextHistogram.from_pages(pages_without_duplicates)

    # Phase 3: Classify using the hints (on pages without duplicates)
    config = ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
    classifier = Classifier(config)

    results = []
    for page_data, page_without_duplicates, removed_mapping, was_skipped in zip(
        pages,
        pages_without_duplicates,
        removed_blocks_per_page,
        skipped_flags,
        strict=True,
    ):
        # Handle skipped pages
        if was_skipped:
            results.append(
                ClassificationResult(
                    page_data=page_data,
                    skipped_reason=(
                        f"Page has {len(page_data.blocks)} blocks, which exceeds "
                        f"the threshold of {MAX_BLOCKS_PER_PAGE}. This is likely an "
                        f"info/inventory page with vectorized text."
                    ),
                )
            )
            continue

        # Classify using only non-removed blocks
        result = classifier.classify(page_without_duplicates)

        # Update result to use original page_data (with all blocks)
        result.page_data = page_data

        # Mark removed blocks
        for removed_block, removal_reason in removed_mapping.items():
            result.mark_removed(removed_block, removal_reason)

        results.append(result)

    return BatchClassificationResult(results=results, histogram=histogram)
271

272

273
type Classifiers = (
1✔
274
    PageNumberClassifier
275
    | ProgressBarClassifier
276
    | ProgressBarIndicatorClassifier
277
    | PreviewClassifier
278
    | BackgroundClassifier
279
    | DividerClassifier
280
    | BagNumberClassifier
281
    | PartCountClassifier
282
    | PartNumberClassifier
283
    | StepNumberClassifier
284
    | StepCountClassifier
285
    | PieceLengthClassifier
286
    | PartsClassifier
287
    | PartsListClassifier
288
    | PartsImageClassifier
289
    | ShineClassifier
290
    | NewBagClassifier
291
    | DiagramClassifier
292
    | ArrowClassifier
293
    | SubAssemblyClassifier
294
    | StepClassifier
295
    | TriviaTextClassifier
296
    | PageClassifier
297
)
298

299

300
class Classifier:
    """
    Runs one rule-based classification pass driven by a configuration and hints.
    This class should be stateless.
    """

    def __init__(self, config: ClassifierConfig):
        """Instantiate every classifier with ``config`` and order them.

        Args:
            config: Configuration handed unchanged to each classifier.
        """
        self.config = config
        classifier_types = (
            PageNumberClassifier,
            ProgressBarIndicatorClassifier,
            ProgressBarClassifier,
            BackgroundClassifier,
            DividerClassifier,
            BagNumberClassifier,
            PartCountClassifier,
            PartNumberClassifier,
            StepNumberClassifier,
            StepCountClassifier,
            PieceLengthClassifier,
            PartsClassifier,
            PartsListClassifier,
            DiagramClassifier,
            RotationSymbolClassifier,
            ArrowClassifier,
            PartsImageClassifier,
            ShineClassifier,
            NewBagClassifier,
            PreviewClassifier,
            SubAssemblyClassifier,
            StepClassifier,
            TriviaTextClassifier,
            PageClassifier,
        )
        # Sort classifiers topologically based on their dependencies
        self.classifiers = topological_sort(
            [classifier_type(config=config) for classifier_type in classifier_types]
        )

    def classify(self, page_data: PageData) -> ClassificationResult:
        """
        Classify one page and return a fresh ClassificationResult.
        It does NOT modify page_data directly.

        The classification process runs in two phases:
        1. Score all classifiers (bottom-up) - auto-registers classifiers
        2. Construct final elements (top-down starting from Page)

        Any post-classification warnings are then attached to the result.
        """
        result = ClassificationResult(page_data=page_data)

        logger.debug(f"Starting classification for page {page_data.page_number}")

        # 1. Score all classifiers (Bottom-Up)
        # Note: score() automatically registers each classifier for its output labels
        for scorer in self.classifiers:
            scorer.score(result)

        # 2. Construct (Top-Down)
        # The PageClassifier is the entry point of the construction process.
        root_classifier = next(
            candidate
            for candidate in self.classifiers
            if isinstance(candidate, PageClassifier)
        )
        root_classifier.build_all(result)

        # TODO Do we actually ever add warnings?
        for message in self._log_post_classification_warnings(page_data, result):
            result.add_warning(message)

        return result

    def _log_post_classification_warnings(
        self, page_data: PageData, result: ClassificationResult
    ) -> list[str]:
        """Collect sanity-check warnings about the classified page.

        Three checks are made: the page has a winning page number, every
        winning parts list contains at least one part count, and every
        winning step number has some parts list above it.

        Args:
            page_data: The page being checked; only its page number is read,
                to prefix each message.
            result: The classification result queried for winning elements.

        Returns:
            The warning messages, in the order the checks are made. Empty
            when nothing looks wrong.
        """
        messages = []

        # Check if there's a page number
        if not result.get_winners_by_score("page_number", PageNumber):
            messages.append(f"Page {page_data.page_number}: missing page number")

        # Get elements by label
        parts_lists = result.get_winners_by_score("parts_list", PartsList)
        part_counts = result.get_winners_by_score("part_count", PartCount)

        for parts_list in parts_lists:
            if filter_contained(part_counts, parts_list.bbox):
                continue
            messages.append(
                f"Page {page_data.page_number}: parts list at {parts_list.bbox} "
                f"contains no part counts"
            )

        step_numbers = result.get_winners_by_score("step_number", StepNumber)
        # Slack allowed when comparing a parts list's y1 with a step's y0.
        vertical_tolerance = 2.0
        for step in step_numbers:
            step_bbox = step.bbox
            has_parts_list_above = any(
                candidate.bbox.y1 <= step_bbox.y0 + vertical_tolerance
                for candidate in parts_lists
            )
            if has_parts_list_above:
                continue
            messages.append(
                f"Page {page_data.page_number}: step number '{step.value}' "
                f"at {step_bbox} has no parts list above it"
            )
        return messages
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc