• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20472199734

23 Dec 2025 09:39PM UTC coverage: 88.693% (+0.2%) from 88.542%
20472199734

push

github

bramp
Add no-orphan constraints for Step child elements

- Add no-orphan constraints to StepClassifier.declare_constraints() for:
  - arrows (point from subassembly callouts to main diagram)
  - rotation_symbols (indicate model rotation)
  - subassemblies (callout boxes within steps)
  - substeps (mini-steps within a main step)
  - diagrams (the main instruction graphic)

- If any of these elements are selected, at least one step must also be
  selected, preventing orphaned elements

- Add unit tests for no-orphan constraint declaration

- Update architecture docs with no-orphan constraint documentation

- Add TODO for potential future centralization in SchemaConstraintGenerator

66 of 67 new or added lines in 2 files covered. (98.51%)

151 existing lines in 8 files now uncovered.

14786 of 16671 relevant lines covered (88.69%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.35
/src/build_a_long/pdf_extract/classifier/classifier.py
1
"""
2
Rule-based classifier for labeling page elements.
3

4
Pipeline order and dependencies
5
--------------------------------
6
The classification pipeline operates in two main phases:
7

8
1. **Bottom-up Scoring**: All classifiers run independently to identify potential
9
   candidates (e.g. page numbers, part counts, step numbers) and score them based
10
   on heuristics. No construction of final elements happens here.
11

12
2. **Top-down Construction**: The root `PageClassifier` is invoked to construct
13
   the final `Page` object. It recursively requests the construction of its
14
   dependencies (e.g. "Give me the best PageNumber"), which in turn construct
15
   their own dependencies. This ensures a consistent and validated object tree.
16

17
"""
18

19
from __future__ import annotations
1✔
20

21
import logging
1✔
22

23
from build_a_long.pdf_extract.classifier.bags import (
1✔
24
    BagNumberClassifier,
25
    LoosePartSymbolClassifier,
26
    OpenBagClassifier,
27
)
28
from build_a_long.pdf_extract.classifier.batch_classification_result import (
1✔
29
    BatchClassificationResult,
30
)
31
from build_a_long.pdf_extract.classifier.block_filter import (
1✔
32
    filter_duplicate_blocks,
33
    filter_overlapping_text_blocks,
34
)
35
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
36
from build_a_long.pdf_extract.classifier.classification_result import (
1✔
37
    ClassificationResult,
38
)
39
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
40
from build_a_long.pdf_extract.classifier.pages import (
1✔
41
    PageHintCollection,
42
)
43
from build_a_long.pdf_extract.classifier.pages.background_classifier import (
1✔
44
    BackgroundClassifier,
45
)
46
from build_a_long.pdf_extract.classifier.pages.divider_classifier import (
1✔
47
    DividerClassifier,
48
)
49
from build_a_long.pdf_extract.classifier.pages.full_page_background_classifier import (
1✔
50
    FullPageBackgroundClassifier,
51
)
52
from build_a_long.pdf_extract.classifier.pages.info_page_decoration_classifier import (
1✔
53
    InfoPageDecorationClassifier,
54
)
55
from build_a_long.pdf_extract.classifier.pages.page_classifier import PageClassifier
1✔
56
from build_a_long.pdf_extract.classifier.pages.page_edge_classifier import (
1✔
57
    PageEdgeClassifier,
58
)
59
from build_a_long.pdf_extract.classifier.pages.page_number_classifier import (
1✔
60
    PageNumberClassifier,
61
)
62
from build_a_long.pdf_extract.classifier.pages.preview_classifier import (
1✔
63
    PreviewClassifier,
64
)
65
from build_a_long.pdf_extract.classifier.pages.progress_bar_bar_classifier import (
1✔
66
    ProgressBarBarClassifier,
67
)
68
from build_a_long.pdf_extract.classifier.pages.progress_bar_classifier import (
1✔
69
    ProgressBarClassifier,
70
)
71
from build_a_long.pdf_extract.classifier.pages.progress_bar_indicator_classifier import (  # noqa: E501
1✔
72
    ProgressBarIndicatorClassifier,
73
)
74
from build_a_long.pdf_extract.classifier.pages.trivia_text_classifier import (
1✔
75
    TriviaTextClassifier,
76
)
77
from build_a_long.pdf_extract.classifier.parts import (
1✔
78
    PartCountClassifier,
79
    PartNumberClassifier,
80
    PartsClassifier,
81
    PartsImageClassifier,
82
    PartsListClassifier,
83
    PieceLengthClassifier,
84
    ScaleClassifier,
85
    ScaleTextClassifier,
86
    ShineClassifier,
87
)
88
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
89
from build_a_long.pdf_extract.classifier.steps import (
1✔
90
    ArrowClassifier,
91
    DiagramClassifier,
92
    RotationSymbolClassifier,
93
    StepClassifier,
94
    StepCountClassifier,
95
    StepNumberClassifier,
96
    SubAssemblyClassifier,
97
    SubStepClassifier,
98
    SubStepNumberClassifier,
99
)
100
from build_a_long.pdf_extract.classifier.text import FontSizeHints, TextHistogram
1✔
101
from build_a_long.pdf_extract.classifier.topological_sort import topological_sort
1✔
102
from build_a_long.pdf_extract.extractor import PageData
1✔
103
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
104

105
logger = logging.getLogger(__name__)
1✔
106

107
# Pages with more blocks than this threshold will be skipped during classification.
108
# This avoids O(n²) algorithms (like duplicate detection) that become prohibitively
109
# slow on pages with thousands of vector drawings. Such pages are typically info
110
# pages where each character is a separate vector graphic.
111
# TODO: Add spatial indexing to handle high-block pages efficiently.
112
MAX_BLOCKS_PER_PAGE = 1000
1✔
113

114

115
# TODO require config, so we don't accidentally use default empty config
116
def classify_elements(
    page: PageData, config: ClassifierConfig | None = None
) -> ClassificationResult:
    """Run the rule-based classifier over one page and return its result.

    Args:
        page: The single PageData object whose elements should be labeled.
        config: Classifier configuration carrying font/page hints. When
            omitted (None), a default-constructed ClassifierConfig is used,
            which means no hints. Passing a config whose FontSizeHints were
            computed from multiple pages of the same PDF gives better
            classification accuracy.

    Returns:
        The ClassificationResult produced by ``Classifier.classify`` for
        ``page``.
    """
    # Only substitute the default when the caller passed nothing at all.
    effective_config = ClassifierConfig() if config is None else config
    return Classifier(effective_config).classify(page)
136

137

138
def classify_pages(
    pages: list[PageData], pages_for_hints: list[PageData] | None = None
) -> BatchClassificationResult:
    """Classify a batch of pages, sharing hints derived from the whole batch.

    The work is done in three phases:

    1. Filtering: on each page, overlapping text blocks and duplicate
       image/drawing blocks are identified and recorded as removed.
    2. Analysis: font size hints and page hints are built from the hint
       pages (with removed blocks excluded), and a text histogram is built
       from the pages being classified.
    3. Classification: one Classifier configured with those hints is run on
       every page that was not skipped.

    Pages holding more than MAX_BLOCKS_PER_PAGE blocks are not classified;
    they yield a ClassificationResult that only carries a ``skipped_reason``.

    Args:
        pages: The PageData objects to classify.
        pages_for_hints: Pages used to generate font/page hints. When None,
            ``pages`` is used. This lets hints come from all pages while only
            a subset is classified (e.g. with a --pages filter).

    Returns:
        A BatchClassificationResult with one result per entry of ``pages``
        (in the same order) plus the histogram.
    """

    # TODO There is a bunch of duplication in here between hints and non-hints. Refactor

    hint_source = pages if pages_for_hints is None else pages_for_hints

    def exceeds_block_limit(candidate_page: PageData) -> bool:
        # High-block pages are likely info/inventory pages with vectorized
        # text; the O(n²) filters below would be very slow on them.
        return len(candidate_page.blocks) > MAX_BLOCKS_PER_PAGE

    # Phase 1: work out, per page to classify, which blocks get removed.
    # Index i of `removals_by_page` corresponds to pages[i].
    removals_by_page: list[dict[Blocks, RemovalReason]] = []
    oversized_page_numbers: set[int] = set()

    for current in pages:
        if exceeds_block_limit(current):
            logger.debug(
                f"Page {current.page_number}: skipping classification "
                f"({len(current.blocks)} blocks exceeds threshold of "
                f"{MAX_BLOCKS_PER_PAGE})"
            )
            oversized_page_numbers.add(current.page_number)
            # Keep the list aligned with `pages`; nothing is removed here.
            removals_by_page.append({})
        else:
            # Overlapping text first (e.g., "4" and "43" at same origin),
            # then IOU-based duplicate image/drawing blocks on what is left.
            survivors, overlapping_text = filter_overlapping_text_blocks(
                current.blocks
            )
            survivors, duplicate_bboxes = filter_duplicate_blocks(survivors)

            # Merge both removal mappings; on a key clash the bbox entry wins.
            merged_removals = dict(overlapping_text)
            merged_removals.update(duplicate_bboxes)

            logger.debug(
                f"Page {current.page_number}: "
                f"filtered {len(overlapping_text)} overlapping text, "
                f"{len(duplicate_bboxes)} duplicate bbox blocks"
            )

            removals_by_page.append(merged_removals)

    # Phase 2: build de-duplicated copies of the hint pages. The hint pages
    # may differ from the pages being classified, so they are filtered again.
    # TODO We are re-filtering duplicates here; optimize by changing the API
    # to accept one list of PageData, and separate by page_numbers.
    deduplicated_hint_pages = []
    for hint_page in hint_source:
        if exceeds_block_limit(hint_page):
            # Same threshold as for classification.
            continue

        hint_blocks, _ = filter_overlapping_text_blocks(hint_page.blocks)
        hint_blocks, _ = filter_duplicate_blocks(hint_blocks)

        deduplicated_hint_pages.append(
            PageData(
                page_number=hint_page.page_number,
                bbox=hint_page.bbox,
                blocks=hint_blocks,
            )
        )

    # De-duplicated copies of the pages to classify: drop every block that
    # any filter removed. Skipped pages have an empty mapping and keep all
    # of their blocks.
    deduplicated_pages = [
        PageData(
            page_number=original.page_number,
            bbox=original.bbox,
            blocks=[blk for blk in original.blocks if blk not in removed],
        )
        for original, removed in zip(pages, removals_by_page, strict=True)
    ]

    # Hints come from the hint pages, the histogram from the pages to classify.
    font_size_hints = FontSizeHints.from_pages(deduplicated_hint_pages)
    page_hints = PageHintCollection.from_pages(deduplicated_hint_pages)
    histogram = TextHistogram.from_pages(deduplicated_pages)

    # Phase 3: classify every non-skipped page with a shared classifier.
    classifier = Classifier(
        ClassifierConfig(font_size_hints=font_size_hints, page_hints=page_hints)
    )

    results = []
    for original, deduplicated, removed in zip(
        pages, deduplicated_pages, removals_by_page, strict=True
    ):
        if original.page_number in oversized_page_numbers:
            page_result = ClassificationResult(
                page_data=original,
                skipped_reason=(
                    f"Page has {len(original.blocks)} blocks, which exceeds "
                    f"the threshold of {MAX_BLOCKS_PER_PAGE}. This is likely an "
                    f"info/inventory page with vectorized text."
                ),
            )
        else:
            # Classify on the de-duplicated view only.
            page_result = classifier.classify(deduplicated)

            # Point the result back at the original page (with all blocks),
            # then record why each filtered block was dropped.
            page_result.page_data = original
            for dropped_block, reason in removed.items():
                page_result.mark_removed(dropped_block, reason)

        results.append(page_result)

    return BatchClassificationResult(results=results, histogram=histogram)
282

283

284
type Classifiers = (
1✔
285
    PageNumberClassifier
286
    | ProgressBarBarClassifier
287
    | ProgressBarClassifier
288
    | ProgressBarIndicatorClassifier
289
    | PreviewClassifier
290
    | FullPageBackgroundClassifier
291
    | PageEdgeClassifier
292
    | BackgroundClassifier
293
    | DividerClassifier
294
    | InfoPageDecorationClassifier
295
    | BagNumberClassifier
296
    | PartCountClassifier
297
    | PartNumberClassifier
298
    | StepNumberClassifier
299
    | StepCountClassifier
300
    | PieceLengthClassifier
301
    | ScaleClassifier
302
    | ScaleTextClassifier
303
    | PartsClassifier
304
    | PartsListClassifier
305
    | PartsImageClassifier
306
    | ShineClassifier
307
    | OpenBagClassifier
308
    | LoosePartSymbolClassifier
309
    | DiagramClassifier
310
    | ArrowClassifier
311
    | SubAssemblyClassifier
312
    | StepClassifier
313
    | TriviaTextClassifier
314
    | PageClassifier
315
)
316

317

318
class Classifier:
1✔
319
    """Performs a single run of classification based on rules, configuration, and hints.
320

321
    This class orchestrates the two-phase classification process:
322
    1. **Scoring Phase**: All classifiers run `_score()` to create candidates
323
    2. **Construction Phase**: PageClassifier.build_all() triggers top-down construction
324

325
    This class should be stateless.
326

327
    Best Practices for Writing Classifiers
328
    =======================================
329

330
    Phase 1: Scoring (`_score()` method)
331
    ------------------------------------
332

333
    The scoring phase evaluates blocks and creates candidates. Key rules:
334

335
    **Allowed API Access:**
336
    - `result.page_data.blocks` - Access all page blocks
337
    - `result.get_candidates(label)` - Get candidates for a label
338
    - `result.get_scored_candidates(label)` - Get scored candidates (identical to
339
      get_candidates() during scoring phase since nothing is built yet)
340
    - IMPORTANT: Only request candidates for labels in your `requires` frozenset
341

342
    **Scoring Philosophy:**
343
    - Score based on INTRINSIC properties (size, position, text content, color)
344
    - Observe potential relationships to inform score ("could have 3 children")
345
    - DO NOT pre-assign specific child candidates in your scoring logic
346
    - DO NOT check `result.is_consumed()` - that's for the build phase
347

348
    **Score Object Requirements:**
349
    - MUST inherit from the `Score` abstract base class
350
    - SHOULD store candidate references from dependencies (e.g.,
351
      `part_count_candidate: Candidate`)
352
    - Should NOT store Block objects directly
353
    - Reason: Makes it clear if scoring depends on a built candidate or not
354

355
    **Example:**
356

357
    .. code-block:: python
358

359
        class MyScore(Score):
360
            # Score with dependency candidate reference.
361
            intrinsic_score: float
362
            child_candidate: Candidate | None  # OK: Store candidate
363
            # child_block: Block | None  # BAD: Don't store blocks
364

365
            def score(self) -> Weight:
366
                return self.intrinsic_score
367

368
        def _score(self, result: ClassificationResult) -> None:
369
            # Get dependency candidates (only if in self.requires)
370
            child_candidates = result.get_scored_candidates("child_label")
371

372
            for block in result.page_data.blocks:
373
                # Score based on intrinsic properties
374
                intrinsic_score = self._calculate_intrinsic_score(block)
375

376
                # Optional: observe potential children to inform score
377
                best_child = None
378
                if child_candidates:
379
                    best_child = self._find_closest(block, child_candidates)
380
                    if best_child:
381
                        intrinsic_score += 0.2  # Boost if potential child exists
382

383
                score_obj = MyScore(
384
                    intrinsic_score=intrinsic_score,
385
                    child_candidate=best_child,  # Store candidate reference
386
                )
387

388
                result.add_candidate(
389
                    Candidate(
390
                        bbox=block.bbox,
391
                        label=self.output,
392
                        score=score_obj.score(),
393
                        score_details=score_obj,
394
                        source_blocks=[block],
395
                    )
396
                )
397

398
    Phase 2: Construction (`build()` method)
399
    ----------------------------------------
400

401
    The build phase constructs LegoPageElements from winning candidates. Key rules:
402

403
    **Construction Process:**
404
    - Validate that dependency candidates are still valid (not consumed/failed)
405
    - Use `result.build(candidate)` to construct child elements
406
    - Discover relationships at build time (don't rely on pre-scored relationships)
407
    - Check `result.is_consumed()` if searching for available blocks
408

409
    **Source Blocks Rules:**
410
    - A source block should only be assigned to ONE built candidate
411
    - Multiple candidates can reference a block during scoring, but only one builds
412
    - **Non-composite elements**: MUST have 1+ source blocks
413
    - **Composite elements**: MAY have 0+ source blocks (decoration, borders)
414
    - Parent's source_blocks should NOT include child's source_blocks
415

416
    **Exception Handling:**
417
    - Raise `CandidateFailedError` for intentional build failures
418
    - Let other exceptions (TypeError, AttributeError) propagate naturally
419
    - Caller should catch exceptions if element is optional or alternatives exist
420
    - Otherwise, let exceptions bubble up
421

422
    **Example:**
423

424
    .. code-block:: python
425

426
        def build(
427
            self, candidate: Candidate, result: ClassificationResult
428
        ) -> MyElement:
429
            # Construct element from candidate.
430
            score = candidate.score_details
431
            assert isinstance(score, MyScore)
432

433
            # Build child if candidate is still valid
434
            child_elem = None
435
            if score.child_candidate:
436
                try:
437
                    child_elem = result.build(score.child_candidate)
438
                    assert isinstance(child_elem, ChildElement)
439
                except CandidateFailedError:
440
                    # Child failed - either fail or continue without it
441
                    if self._requires_child:
442
                        raise  # Propagate failure
443
                    # Otherwise continue with child_elem = None
444

445
            return MyElement(
446
                bbox=candidate.bbox,
447
                child=child_elem,
448
                # source_blocks inherited from candidate
449
            )
450

451
    Phase 2b: Global Coordination (`build_all()` method)
452
    ----------------------------------------------------
453

454
    Most classifiers use the default `build_all()` which iterates through
455
    candidates and calls `build()` on each. Override when you need:
456

457
    **When to Override:**
458
    - Global optimization (e.g., Hungarian matching to find N best pairings)
459
    - Building multiple candidates with interdependencies
460
    - Pre-build setup that affects all candidates
461

462
    **Key Differences:**
463
    - `build()`: Works in isolation on a single candidate
464
    - `build_all()`: Coordinates multiple candidates globally
465

466
    **Can build_all() call other labels' builds?**
467
    - Technically yes, but best to avoid unless necessary
468
    - Usually each classifier manages only its own label's candidates
469

470
    **Example:**
471

472
    .. code-block:: python
473

474
        def build_all(self, result: ClassificationResult) -> list[LegoPageElements]:
475
            # Build candidates using Hungarian matching.
476
            candidates = result.get_candidates(self.output)
477

478
            # Perform global optimization
479
            best_assignments = self._hungarian_match(candidates)
480

481
            elements = []
482
            for candidate in best_assignments:
483
                try:
484
                    elem = result.build(candidate)
485
                    elements.append(elem)
486
                except CandidateFailedError as e:
487
                    log.debug(f"Failed to build {candidate.label}: {e}")
489

490
            return elements
491

492
    Common Patterns
493
    ---------------
494

495
    **Pattern 1: Atomic Classifier (single block → element)**
496

497
    **Recommendation**: Use `RuleBasedClassifier` for most atomic classifiers.
498
    It provides a declarative, maintainable way to score blocks using composable
499
    rules. Only implement custom `_score()` logic when you need complex pairing
500
    or non-standard scoring that can't be expressed with rules.
501

502
    .. code-block:: python
503

504
        class MyAtomicClassifier(RuleBasedClassifier):
505
            output = "my_label"
506
            requires = frozenset()  # No dependencies
507

508
            @property
509
            def rules(self) -> Sequence[Rule]:
510
                return [
511
                    IsInstanceFilter((Text,)),
512
                    PositionScore(...),
513
                    # ... more rules
514
                ]
515

516
            def build(self, candidate, result) -> MyElement:
517
                return MyElement(bbox=candidate.bbox)
518

519
    **Pattern 2: Composite Classifier (combines other elements)**
520

521
    .. code-block:: python
522

523
        class MyCompositeClassifier(LabelClassifier):
524
            output = "my_composite"
525
            requires = frozenset({"child1", "child2"})
526

527
            def _score(self, result):
528
                child1_cands = result.get_scored_candidates("child1")
529
                child2_cands = result.get_scored_candidates("child2")
530

531
                # Create composite candidates by pairing children
532
                for c1 in child1_cands:
533
                    for c2 in child2_cands:
534
                        if self._are_related(c1, c2):
535
                            score = self._compute_pair_score(c1, c2)
536
                            result.add_candidate(
537
                                Candidate(
538
                                    bbox=BBox.union(c1.bbox, c2.bbox),
539
                                    label=self.output,
540
                                    score=score.score(),
541
                                    score_details=score,
542
                                    source_blocks=[],  # Composite
543
                                )
544
                            )
545

546
            def build(self, candidate, result) -> MyComposite:
547
                score = candidate.score_details
548
                child1 = result.build(score.child1_candidate)
549
                child2 = result.build(score.child2_candidate)
550
                return MyComposite(bbox=candidate.bbox, c1=child1, c2=child2)
551

552
    See Also
553
    --------
554
    - classifier/DESIGN.md: Architectural principles
555
    - classifier/README.md: Classification pipeline overview
556
    - LabelClassifier: Base class for all classifiers
557
    - RuleBasedClassifier: Rule-based classifier base class
558
    """
559

560
    # Labels for which the constraint solver is enabled by default.
561
    # The CP-SAT solver ensures consistent parent-child relationships between
562
    # elements (e.g., parts_list contains parts, parts have part_count/part_number).
563
    DEFAULT_SOLVER_LABELS: frozenset[str] = frozenset(
1✔
564
        {
565
            # Parts-related labels
566
            "parts_list",
567
            "part",
568
            "part_count",
569
            "part_image",
570
            "part_number",
571
            # Page-level singleton elements
572
            "page_number",
573
            "progress_bar",
574
            "progress_bar_bar",
575
            "progress_bar_indicator",
576
            "background",
577
            # Dividers (block exclusivity via solver)
578
            "divider",
579
            # Step-related labels (uniqueness by step_value)
580
            "step",
581
            "step_number",
582
        }
583
    )
584

585
    def __init__(
        self,
        config: ClassifierConfig,
        use_constraint_solver: bool = False,
        use_solver_for: set[str] | None = None,
    ):
        """Set up the classifier pipeline and its constraint-solver options.

        Args:
            config: Classifier configuration with hints and settings. The same
                object is passed to every individual classifier.
            use_constraint_solver: If True, use the CP-SAT solver globally for
                all labels. Takes precedence over use_solver_for if both are
                specified.
            use_solver_for: Set of labels to solve with CP-SAT
                (e.g., {"parts_list"}). If None, a copy of
                DEFAULT_SOLVER_LABELS is used (parts, page-level singleton,
                divider and step labels). Pass an empty set() to disable the
                solver entirely. A caller-supplied set is stored as-is, not
                copied.
        """
        self.config = config
        self.use_constraint_solver = use_constraint_solver

        # None means "not specified" and selects the defaults; an explicitly
        # passed empty set is kept so the caller can switch the solver off.
        self.use_solver_for = (
            set(self.DEFAULT_SOLVER_LABELS)
            if use_solver_for is None
            else use_solver_for
        )

        # Every classifier taking part in the pipeline. The order written here
        # is only the input order; topological_sort reorders them based on
        # their dependencies.
        pipeline = [
            PageNumberClassifier(config=config),
            ProgressBarIndicatorClassifier(config=config),
            ProgressBarBarClassifier(config=config),
            ProgressBarClassifier(config=config),
            FullPageBackgroundClassifier(config=config),
            PageEdgeClassifier(config=config),
            BackgroundClassifier(config=config),
            DividerClassifier(config=config),
            InfoPageDecorationClassifier(config=config),
            BagNumberClassifier(config=config),
            PartCountClassifier(config=config),
            PartNumberClassifier(config=config),
            StepNumberClassifier(config=config),
            SubStepNumberClassifier(config=config),
            StepCountClassifier(config=config),
            PieceLengthClassifier(config=config),
            ScaleTextClassifier(config=config),
            ScaleClassifier(config=config),
            PartsClassifier(config=config),
            PartsListClassifier(config=config),
            DiagramClassifier(config=config),
            RotationSymbolClassifier(config=config),
            ArrowClassifier(config=config),
            PartsImageClassifier(config=config),
            ShineClassifier(config=config),
            OpenBagClassifier(config=config),
            LoosePartSymbolClassifier(config=config),
            PreviewClassifier(config=config),
            SubStepClassifier(config=config),
            SubAssemblyClassifier(config=config),
            StepClassifier(config=config),
            TriviaTextClassifier(config=config),
            PageClassifier(config=config),
        ]
        self.classifiers = topological_sort(pipeline)
648

649
    def classify(self, page_data: PageData) -> ClassificationResult:
        """Classify one page and return the populated result.

        ``page_data`` is wrapped in a fresh ClassificationResult; this method
        does not assign to ``page_data`` itself.

        Steps, in order:
        1. Score all classifiers (bottom-up) - scoring auto-registers each
           classifier for its output labels.
        2. [Optional] Run the constraint solver to select candidates, when
           ``use_constraint_solver`` is set or ``use_solver_for`` is non-empty.
        3. Construct final elements (top-down, starting from the Page).
        4. Validate classification invariants.

        Args:
            page_data: The page to classify.

        Returns:
            The ClassificationResult built for ``page_data``.
        """
        outcome = ClassificationResult(page_data=page_data)

        logger.debug(f"Starting classification for page {page_data.page_number}")

        # 1. Bottom-up scoring, in the dependency order fixed at construction.
        for scorer in self.classifiers:
            scorer.score(outcome)

        # 2. Optional solver pass; an empty label set with the global flag
        # off skips it.
        solver_requested = self.use_constraint_solver or self.use_solver_for
        if solver_requested:
            self._run_constraint_solver(outcome)

        # 3. Top-down construction starts at the first PageClassifier found.
        root_classifier = next(
            filter(
                lambda member: isinstance(member, PageClassifier),
                self.classifiers,
            )
        )
        root_classifier.build_all(outcome)

        # 4. Check invariants on the finished result before handing it back.
        self._validate_classification_result(outcome)

        return outcome
683

684
    def _run_constraint_solver(self, result: ClassificationResult) -> None:
        """Run CP-SAT constraint solver to select candidates.

        This method:
        1. Creates a ConstraintModel and adds every candidate whose label is
           handled by the solver
        2. Adds block exclusivity constraints between those candidates
        3. Calls declare_constraints() on each classifier
        4. Runs auto-generation of schema-based constraints and adds child
           uniqueness constraints
        5. Solves the constraint problem, maximizing the total candidate score
        6. Marks selected candidates in the result

        If the solver does not find a solution, a warning is logged and an
        empty selection is recorded for the solved labels.

        Args:
            result: The classification result with scored candidates
        """
        from build_a_long.pdf_extract.classifier.constraint_model import (  # noqa: PLC0415
            ConstraintModel,
        )
        from build_a_long.pdf_extract.classifier.schema_constraint_generator import (  # noqa: PLC0415
            SchemaConstraintGenerator,
        )

        logger.debug(
            f"Running constraint solver for page {result.page_data.page_number}"
        )

        # Create constraint model
        model = ConstraintModel()

        # Determine which labels to include. Both branches build a fresh set so
        # the same type is handed to the result either way, and the
        # classifier's own configuration object is never passed along.
        if self.use_constraint_solver:
            # Use solver for all labels
            labels_to_solve = set(result.candidates.keys())
        else:
            # Use solver only for specified labels
            labels_to_solve = set(self.use_solver_for)

        # Add candidates to the model (filtered by label if needed)
        all_candidates: list[Candidate] = []
        for label, candidates in result.candidates.items():
            if label not in labels_to_solve:
                continue
            for candidate in candidates:
                model.add_candidate(candidate)
                all_candidates.append(candidate)

        logger.debug(
            f"  Added {len(all_candidates)} candidates "
            f"for labels: {sorted(labels_to_solve)}"
        )

        # Add block exclusivity constraints
        # (candidates sharing blocks are mutually exclusive)
        model.add_block_exclusivity_constraints(all_candidates)

        # Let each classifier declare custom constraints
        for classifier in self.classifiers:
            classifier.declare_constraints(model, result)

        # Auto-generate schema-based constraints
        generator = SchemaConstraintGenerator()
        for classifier in self.classifiers:
            generator.generate_for_classifier(classifier, model, result)

        # Add child uniqueness constraints (each child has at most one parent)
        generator.add_child_uniqueness_constraints(model)

        # Maximize total score (scale float scores 0.0-1.0 to int weights 0-1000).
        # round() rather than int(): truncation turns a score that is one ULP
        # below its intended value (e.g. 0.7 + 0.1 == 0.7999999999999999) into
        # 799 instead of 800, which can flip ties between candidates.
        model.maximize([(cand, round(cand.score * 1000)) for cand in all_candidates])

        # Solve
        solved, selection = model.solve()

        if not solved:
            logger.warning(
                f"Constraint solver failed for page {result.page_data.page_number}, "
                "falling back to empty selection"
            )
            result.set_solver_selection([], labels_to_solve)
            return

        # Mark selected candidates (ids missing from the selection count as
        # not selected)
        selected_candidates = [
            cand for cand in all_candidates if selection.get(cand.id, False)
        ]
        logger.debug(
            f"  Solver selected {len(selected_candidates)}/"
            f"{len(all_candidates)} candidates"
        )
        result.set_solver_selection(selected_candidates, labels_to_solve)
771

772
    def _validate_classification_result(self, result: ClassificationResult) -> None:
        """Assert that a finished classification satisfies its invariants.

        The checks are assertions intended to catch programming errors in
        classifier code (elements constructed or tracked incorrectly), not
        problems with the input page.

        Checks run, in this order:
        - assert_constructed_elements_on_page: all constructed elements appear
          in the Page hierarchy (no orphans)
        - assert_element_bbox_matches_source_and_children: element bboxes
          match the union of source blocks + child elements
        - assert_no_shared_source_blocks

        The "all page elements are tracked via candidates" check
        (assert_page_elements_tracked) is currently disabled; see the TODO
        below.

        Args:
            result: The classification result to validate

        Raises:
            AssertionError: If any invariant is violated
        """
        # Deferred import to break a circular dependency:
        # - classifier.py imports validation.rules
        # - validation.rules imports ClassificationResult from classifier
        # Importing at call time means both modules are fully loaded before
        # this statement executes.
        from build_a_long.pdf_extract.validation.rules import (  # noqa: PLC0415
            assert_constructed_elements_on_page,
            assert_element_bbox_matches_source_and_children,
            assert_no_shared_source_blocks,
        )

        # TODO: Re-enable assert_page_elements_tracked (import it above and
        # add it as the first check) once Part elements go through the
        # candidate system.
        invariant_checks = (
            assert_constructed_elements_on_page,
            assert_element_bbox_matches_source_and_children,
            assert_no_shared_source_blocks,
        )
        for check in invariant_checks:
            check(result)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc