• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20398712053

20 Dec 2025 07:00PM UTC coverage: 89.361% (+0.2%) from 89.185%
20398712053

push

github

bramp
Improve circular dependency error to show dependency chain

- Add _find_dependency_cycle() to trace and format the actual circular dependency path
- Update error message to include both affected classifiers and the dependency chain
- Add test case to verify circular dependency detection and error message format

48 of 56 new or added lines in 2 files covered. (85.71%)

145 existing lines in 28 files now uncovered.

13700 of 15331 relevant lines covered (89.36%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.51
/src/build_a_long/pdf_extract/classifier/classification_result.py
1
"""ClassificationResult class for single page classification."""
2

3
from __future__ import annotations
1✔
4

5
import logging
1✔
6
from collections.abc import Sequence
1✔
7
from typing import TYPE_CHECKING, Any
1✔
8

9
from pydantic import BaseModel, Field, PrivateAttr, model_validator
1✔
10

11
from build_a_long.pdf_extract.classifier.candidate import Candidate
1✔
12
from build_a_long.pdf_extract.classifier.removal_reason import RemovalReason
1✔
13
from build_a_long.pdf_extract.extractor.extractor import PageData
1✔
14
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
15
    LegoPageElements,
16
    Page,
17
)
18
from build_a_long.pdf_extract.extractor.page_blocks import Blocks
1✔
19

20
if TYPE_CHECKING:
21
    from build_a_long.pdf_extract.classifier.label_classifier import LabelClassifier
22

23
log = logging.getLogger(__name__)
1✔
24

25
# Score key can be either a single Block or a tuple of Blocks (for pairings)
26
ScoreKey = Blocks | tuple[Blocks, ...]
1✔
27

28

29
class CandidateFailedError(Exception):
    """Signal that a candidate could not be built.

    The failing candidate is attached to the exception as ``candidate`` so a
    caller can inspect it, for example to create a replacement and retry.
    """

    def __init__(self, candidate: Candidate, message: str):
        """Record the failed candidate and pass ``message`` to Exception.

        Args:
            candidate: The candidate that could not be built.
            message: Human-readable description of the failure.
        """
        self.candidate = candidate
        super().__init__(message)
1✔
39

40

41
class _BuildSnapshot(BaseModel):
    """Frozen record of candidate and consumed-block state, used for rollback.

    ClassificationResult.build() captures one of these before building and
    restores from it when the build raises, so a failed build leaves the
    candidate states and consumed-block set as if it had never started.
    """

    model_config = {"frozen": True}

    # Map id(candidate) -> (constructed value, failure_reason) at capture time
    candidate_states: dict[int, tuple[LegoPageElements | None, str | None]]
    # Set of block IDs that were marked consumed at capture time
    consumed_blocks: set[int]
1✔
55

56

57
class ClassificationResult(BaseModel):
    """Result of classifying a single page.

    This class stores both the results and intermediate artifacts for a page
    classification. It provides structured access to:
    - Labels assigned to blocks
    - LegoPageElements constructed from blocks
    - Removal reasons for filtered blocks
    - All candidates considered (including rejected ones)

    The use of dictionaries keyed by block IDs (int) instead of Block objects
    ensures JSON serializability and consistent equality semantics.

    TODO: Consider refactoring to separate the DAO (Data Access Object)
    representation from the business logic. The public fields below are used
    for serialization.

    External code should use the accessor methods rather than accessing these
    fields directly to maintain encapsulation and allow future refactoring.
    """

    page_data: PageData
    """The original page data being classified"""

    skipped_reason: str | None = None
    """If set, classification was skipped for this page.

    This is used for pages that cannot be reasonably classified, such as:
    - Pages with too many blocks (e.g., >1000 vector drawings)
    - Info/inventory pages where each character is a separate vector

    When set, most classification results will be empty.
    """

    removal_reasons: dict[int, RemovalReason] = Field(default_factory=dict)
    """Maps block IDs (block.id, not id(block)) to the reason they were removed.

    Keys are block IDs (int) instead of Block objects to ensure JSON serializability
    and consistency with constructed_elements.

    Public for serialization. Prefer using accessor methods.
    """

    candidates: dict[str, list[Candidate]] = Field(default_factory=dict)
    """Maps label names to lists of all candidates considered for that label.

    Each candidate includes:
    - The source element
    - Its score and score details
    - The constructed LegoPageElement (if successful)
    - Failure reason (if construction failed)

    This enables:
    - Re-evaluation with hints (exclude specific candidates)
    - Debugging (see why each candidate won/lost)
    - UI support (show users alternatives)

    Public for serialization. Prefer using get_* accessor methods.
    """

    # Label -> classifier, filled in by _register_classifier(); looked up by
    # build() and build_all_for_label().
    _classifiers: dict[str, LabelClassifier] = PrivateAttr(default_factory=dict)
    # IDs (block.id) of blocks consumed by successfully built candidates.
    _consumed_blocks: set[int] = PrivateAttr(default_factory=set)
1✔
120

121
    @model_validator(mode="after")
    def validate_unique_block_ids(self) -> ClassificationResult:
        """Validate that all block IDs in page_data are unique.

        Block IDs are used as keys (for example in removal_reasons and in the
        consumed-block set), so two blocks sharing an ID could not be told
        apart.

        Returns:
            This instance, unchanged, when every block ID is unique.

        Raises:
            ValueError: If two or more blocks in page_data share an ID.
        """
        # Function-scope import keeps this method self-contained.
        from collections import Counter

        block_ids = [b.id for b in self.page_data.blocks]
        if len(block_ids) != len(set(block_ids)):
            # Count every ID once instead of calling list.count() per element,
            # which is quadratic in the number of blocks.
            occurrences = Counter(block_ids)
            duplicates = [id_ for id_ in block_ids if occurrences[id_] > 1]
            raise ValueError(
                f"PageData blocks must have unique IDs. "
                f"Found duplicates: {set(duplicates)}"
            )
        return self
1✔
138

139
    def is_block_consumed(self, block: Blocks) -> bool:
        """Report whether a block has been consumed by a constructed candidate.

        Args:
            block: The block to look up.

        Returns:
            True when the block's ID is in the consumed set, False otherwise.
        """
        consumed_ids = self._consumed_blocks
        return block.id in consumed_ids
1✔
149

150
    def get_unconsumed_blocks(
        self, block_filter: type[Blocks] | tuple[type[Blocks], ...] | None = None
    ) -> Sequence[Blocks]:
        """Return the page blocks that no constructed candidate has consumed.

        Useful during build() when a classifier wants to pick up additional
        blocks without conflicting with elements that were already built.

        Args:
            block_filter: Optional type, or tuple of types. When given, only
                blocks that are instances of it are returned.

        Returns:
            List of unconsumed blocks in page order, optionally filtered by
            type.
        """
        # TODO: track unconsumed blocks separately if this becomes a
        # performance bottleneck.
        consumed_ids = self._consumed_blocks
        return [
            blk
            for blk in self.page_data.blocks
            if blk.id not in consumed_ids
            and (block_filter is None or isinstance(blk, block_filter))
        ]
1✔
175

176
    def _register_classifier(self, label: str, classifier: LabelClassifier) -> None:
1✔
177
        """Register a classifier for a specific label.
178

179
        This is called automatically by LabelClassifier.score() and should not
180
        be called directly by external code.
181
        """
182
        self._classifiers[label] = classifier
1✔
183

184
    def build_all_for_label(self, label: str) -> Sequence[LegoPageElements]:
        """Build every candidate for ``label`` through its registered classifier.

        The work is delegated to the classifier's build_all() method, which
        lets a classifier run its own coordination logic (e.g., Hungarian
        matching) before building individual candidates.

        Args:
            label: The label to build all candidates for.

        Returns:
            Whatever the classifier's build_all() returns: the successfully
            constructed LegoPageElements.

        Raises:
            ValueError: If no classifier is registered for the label.
        """
        if not (classifier := self._classifiers.get(label)):
            raise ValueError(f"No classifier registered for label '{label}'")

        log.debug(
            "[build_all] Starting build_all for '%s' on page %s",
            label,
            self.page_data.page_number,
        )
        elements = classifier.build_all(self)
        log.debug(
            "[build_all] Completed build_all for '%s' on page %s: built %d elements",
            label,
            self.page_data.page_number,
            len(elements),
        )
        return elements
1✔
217

218
    def build(self, candidate: Candidate, **kwargs: Any) -> LegoPageElements:
        """Construct a candidate using the classifier registered for its label.

        This is the entry point for top-down construction. A snapshot of all
        candidate states and consumed blocks is taken before the classifier
        runs; if anything raises during the build, that snapshot is restored
        before the exception propagates, giving transactional semantics.

        When a nested candidate fails because it was replaced by a reduced
        candidate, the failure is logged and re-raised so the caller can retry
        with the new candidates.

        Args:
            candidate: The candidate to construct.
            **kwargs: Additional keyword arguments passed to the classifier's
                build method. For example, DiagramClassifier accepts
                constraint_bbox to limit clustering.

        Returns:
            The constructed LegoPageElement. If the candidate was already
            constructed, the existing element is returned.

        Raises:
            CandidateFailedError: If the candidate already carries a failure
                reason, one of its source blocks is already consumed, or a
                nested build fails. Callers should only catch this exception
                type, allowing programming errors to propagate.
            ValueError: If no classifier is registered for the candidate's
                label.
        """
        # Already built: hand back the existing element.
        if candidate.constructed:
            return candidate.constructed

        # Already known to be a failure: do not try again.
        if candidate.failure_reason:
            raise CandidateFailedError(
                candidate, f"Candidate failed: {candidate.failure_reason}"
            )

        # Pre-build check, done before the snapshot so a failure recorded
        # here is not rolled back.
        self._check_blocks_not_consumed(candidate, candidate.source_blocks)

        classifier = self._classifiers.get(candidate.label)
        if not classifier:
            raise ValueError(f"No classifier registered for label '{candidate.label}'")

        log.debug(
            "[build] Starting build for '%s' at %s on page %s",
            candidate.label,
            candidate.bbox,
            self.page_data.page_number,
        )

        # State to return to if the build raises.
        snapshot = self._take_snapshot()
        blocks_before_build = list(candidate.source_blocks)

        try:
            element = classifier.build(candidate, self, **kwargs)
            candidate.constructed = element

            # A classifier may add source blocks while building; those have
            # not been through the pre-build check yet.
            added_blocks = [
                b for b in candidate.source_blocks if b not in blocks_before_build
            ]
            if added_blocks:
                log.debug(
                    "[build] Classifier added %d blocks during build for '%s': %s",
                    len(added_blocks),
                    candidate.label,
                    [b.id for b in added_blocks],
                )
                self._check_blocks_not_consumed(candidate, added_blocks)

            # Keep the candidate's bbox in sync with the constructed element
            # (e.g., Step's bbox includes a diagram only known at build time).
            if candidate.bbox != element.bbox:
                log.debug(
                    "[build] Updating candidate bbox from %s to %s - "
                    "This indicate the bbox changed between score and build, "
                    "and may indicate a classification bug",
                    candidate.bbox,
                    element.bbox,
                )
            candidate.bbox = element.bbox

            log.debug(
                "[build] Marking %d blocks as consumed for '%s' at %s: %s",
                len(candidate.source_blocks),
                candidate.label,
                candidate.bbox,
                [b.id for b in candidate.source_blocks],
            )

            self._assert_no_duplicate_source_blocks(candidate)
            self._consumed_blocks.update(b.id for b in candidate.source_blocks)

            # Any other candidate relying on these blocks can no longer win.
            self._fail_conflicting_candidates(candidate)

            return element
        except Exception as exc:
            # Roll back everything done during this build, whatever the cause.
            self._restore_snapshot(snapshot)

            if isinstance(exc, CandidateFailedError):
                # A nested candidate failed. If it was replaced by a reduced
                # candidate, the caller may retry with the new candidates.
                failed = exc.candidate
                reason = failed.failure_reason
                if reason and "Replaced by reduced candidate" in reason:
                    log.debug(
                        "[build] Nested candidate %s (%s) was replaced, "
                        "propagating for retry",
                        failed.label,
                        failed.bbox,
                    )
            raise
1✔
342

343
    def _take_snapshot(self) -> _BuildSnapshot:
        """Capture every candidate's state plus the consumed-block set.

        Returns:
            A snapshot mapping id(candidate) to its current
            (constructed, failure_reason) pair, together with a copy of the
            consumed block IDs.
        """
        states = {
            id(cand): (cand.constructed, cand.failure_reason)
            for group in self.candidates.values()
            for cand in group
        }
        consumed = self._consumed_blocks.copy()
        return _BuildSnapshot(candidate_states=states, consumed_blocks=consumed)
354

355
    def _restore_snapshot(self, snapshot: _BuildSnapshot) -> None:
1✔
356
        """Restore candidate states and consumed blocks from a snapshot."""
357
        # Restore candidate states
358
        for candidates in self.candidates.values():
1✔
359
            for c in candidates:
1✔
360
                cid = id(c)
1✔
361
                if cid in snapshot.candidate_states:
1✔
362
                    c.constructed, c.failure_reason = snapshot.candidate_states[cid]
1✔
363

364
        # Restore consumed blocks
365
        self._consumed_blocks = snapshot.consumed_blocks.copy()
1✔
366

367
    def _check_blocks_not_consumed(
1✔
368
        self, candidate: Candidate, blocks: list[Blocks]
369
    ) -> None:
370
        """Check that none of the given blocks are already consumed.
371

372
        Args:
373
            candidate: The candidate trying to consume these blocks
374
            blocks: The blocks to check
375

376
        Raises:
377
            CandidateFailedError: If any block is already consumed
378
        """
379
        for block in blocks:
1✔
380
            if block.id in self._consumed_blocks:
1✔
381
                # Find who consumed it (for better error message)
382
                # This is expensive but only happens on failure
383
                winner_label = "unknown"
×
384
                for _label, cat_candidates in self.candidates.items():
×
385
                    for c in cat_candidates:
×
UNCOV
386
                        if c.constructed and any(
×
387
                            b.id == block.id for b in c.source_blocks
388
                        ):
389
                            winner_label = _label
×
UNCOV
390
                            break
×
391

392
                failure_msg = f"Block {block.id} already consumed by '{winner_label}'"
×
393
                candidate.failure_reason = failure_msg
×
UNCOV
394
                raise CandidateFailedError(candidate, failure_msg)
×
395

396
    def _assert_no_duplicate_source_blocks(self, candidate: Candidate) -> None:
1✔
397
        """Assert that a candidate has no duplicate blocks in source_blocks.
398

399
        This is a programming error check - duplicates indicate a bug in
400
        the classifier that created the candidate.
401

402
        Args:
403
            candidate: The candidate to check
404

405
        Raises:
406
            AssertionError: If duplicate block IDs are found
407
        """
408
        block_ids = [b.id for b in candidate.source_blocks]
1✔
409
        assert len(block_ids) == len(set(block_ids)), (
1✔
410
            f"Duplicate blocks in source_blocks for '{candidate.label}': {block_ids}"
411
        )
412

413
    def _fail_conflicting_candidates(self, winner: Candidate) -> None:
        """Mark other candidates that share blocks with ``winner`` as failed.

        Every candidate (across all labels) that is not the winner, has no
        failure reason yet, and has at least one source block in common with
        the winner gets a failure_reason describing the conflict.

        Args:
            winner: The candidate whose source blocks have just been claimed.
        """
        winner_block_ids = {b.id for b in winner.source_blocks}
        if not winner_block_ids:
            # Nothing was claimed, so nothing can conflict.
            return

        for label, group in self.candidates.items():
            for other in group:
                # Skip the winner itself and anything that already failed.
                if other is winner or other.failure_reason:
                    continue

                candidate_block_ids = [b.id for b in other.source_blocks]
                conflicting_block_ids = winner_block_ids.intersection(
                    candidate_block_ids
                )
                if not conflicting_block_ids:
                    continue

                failure_reason = (
                    f"Lost conflict to '{winner.label}' at {winner.bbox} "
                    f"(winner_blocks={sorted(winner_block_ids)}, "
                    f"candidate_blocks={candidate_block_ids}, "
                    f"conflicting={sorted(conflicting_block_ids)})"
                )
                other.failure_reason = failure_reason
                log.debug(
                    "[conflict] '%s' at %s failed: %s",
                    label,
                    other.bbox,
                    failure_reason,
                )
454

455
    def _validate_block_in_page_data(
1✔
456
        self, block: Blocks | None, param_name: str = "block"
457
    ) -> None:
458
        """Validate that a block is in PageData.
459

460
        Args:
461
            block: The block to validate (None is allowed and skips validation)
462
            param_name: Name of the parameter being validated (for error messages)
463

464
        Raises:
465
            ValueError: If block is not None and not in PageData.blocks
466
        """
467
        if block is not None and block not in self.page_data.blocks:
1✔
468
            raise ValueError(f"{param_name} must be in PageData.blocks. Block: {block}")
1✔
469

470
    @property
    def blocks(self) -> Sequence[Blocks]:
        """The blocks of the page being classified.

        Returns:
            The ``blocks`` sequence of the underlying page data.
        """
        page_data = self.page_data
        return page_data.blocks
1✔
478

479
    @property
    def page(self) -> Page | None:
        """The Page object built from this classification result.

        Returns:
            The constructed element of the first built "page" candidate, or
            None when no "page" candidate has been built.
        """
        built_pages = self.get_built_candidates("page")
        if not built_pages:
            return None

        constructed = built_pages[0].constructed
        assert isinstance(constructed, Page)
        return constructed
1✔
488

489
    # TODO Reconsider the methods below - some may be redundant.
490

491
    def get_candidates(self, label: str) -> Sequence[Candidate]:
        """Return every candidate recorded for ``label``.

        Args:
            label: The label to get candidates for.

        Returns:
            A new list with the candidates for that label (a copy, so that
            callers cannot modify the stored list). Empty when the label is
            unknown.
        """
        return [*self.candidates.get(label, ())]
1✔
502

503
    def get_scored_candidates(
        self,
        label: str,
        min_score: float = 0.0,
    ) -> Sequence[Candidate]:
        """Return the scored, not-yet-failed candidates for ``label``.

        **Use this method in _score() when working with dependency classifiers.**

        During the scoring phase candidates exist but their ``constructed``
        field is None until build() is called, so this method does not look at
        ``constructed``. It keeps candidates that have score details and no
        failure reason (e.g., they have not lost a conflict).

        Use get_built_candidates() instead when you need only successfully
        constructed candidates (e.g., in build() or after classification).

        Example:
            # In PreviewClassifier._score()
            step_number_candidates = result.get_scored_candidates("step_number")
            for cand in step_number_candidates:
                # Use candidate.bbox for spatial reasoning
                # Store candidate references in score_details for later

        Args:
            label: The label to get candidates for.
            min_score: Optional minimum score threshold (default: 0.0). The
                threshold is only applied when it is greater than zero.

        Returns:
            List of scored candidates sorted by score (highest first),
            excluding failed candidates.
        """
        apply_threshold = min_score > 0

        scored = [
            cand
            for cand in self.get_candidates(label)
            if cand.score_details is not None
            and cand.failure_reason is None
            and (not apply_threshold or cand.score >= min_score)
        ]

        # Stable sort: equal scores keep their original relative order.
        # TODO add a tie breaker for determinism.
        return sorted(scored, key=lambda cand: -cand.score)
1✔
555

556
    def get_built_candidates(
        self,
        label: str,
        min_score: float = 0.0,
    ) -> Sequence[Candidate]:
        """Return the successfully built candidates for ``label``.

        **Use this method in build() or after classification is complete.**

        Only candidates whose ``is_valid`` flag is true are returned; these
        are the candidates whose constructed elements can be safely accessed.

        Use get_scored_candidates() instead during the scoring phase when
        candidates may not yet be constructed.

        Example:
            # In PageClassifier.build()
            page_number_candidates = result.get_built_candidates("page_number")
            if page_number_candidates:
                page_number = page_number_candidates[0].constructed

        Args:
            label: The label to get candidates for.
            min_score: Optional minimum score threshold (default: 0.0). The
                threshold is only applied when it is greater than zero.

        Returns:
            List of successfully constructed candidates sorted by score
            (highest first).
        """
        apply_threshold = min_score > 0

        built = [
            cand
            for cand in self.get_candidates(label)
            if cand.is_valid and (not apply_threshold or cand.score >= min_score)
        ]

        # Stable sort: equal scores keep their original relative order.
        # TODO add a tie breaker for determinism.
        return sorted(built, key=lambda cand: -cand.score)
1✔
600

601
    def get_all_candidates(self) -> dict[str, Sequence[Candidate]]:
        """Return all candidates, grouped by label.

        Returns:
            A new dictionary mapping each label to a new list of its
            candidates (copies, so callers cannot modify the stored
            containers).
        """
        snapshot: dict[str, Sequence[Candidate]] = {}
        for label, group in self.candidates.items():
            snapshot[label] = list(group)
        return snapshot
1✔
609

610
    def count_successful_candidates(self, label: str) -> int:
        """Count the candidates for ``label`` that have a constructed element.

        Test helper method.

        Args:
            label: The label to count successful candidates for.

        Returns:
            Number of candidates whose ``constructed`` is not None.
        """
        constructed = [
            cand for cand in self.get_candidates(label) if cand.constructed is not None
        ]
        return len(constructed)
1✔
622

623
    # TODO This is one of the slowest methods. I wonder if we can change
624
    # the internal data structures to make this faster.
625
    def get_all_candidates_for_block(self, block: Blocks) -> Sequence[Candidate]:
        """Collect the candidates, across all labels, that use ``block``.

        To find the candidate for one specific label, use
        get_candidate_for_block() instead.

        Args:
            block: The block to find candidates for.

        Returns:
            List of all candidates across all labels with this block in
            source_blocks, in label order and then candidate order.
        """
        return [
            cand
            for group in self.candidates.values()
            for cand in group
            if block in cand.source_blocks
        ]
1✔
644

645
    def get_candidate_for_block(self, block: Blocks, label: str) -> Candidate | None:
        """Return the single candidate for a block/label combination.

        Helper method for testing.

        Args:
            block: The block to find the candidate for.
            label: The label to search within.

        Returns:
            The candidate with that label whose source_blocks contain the
            block, or None if there is no such candidate.

        Raises:
            ValueError: If multiple candidates exist for this block/label pair.
        """
        matches = [
            cand for cand in self.get_candidates(label) if block in cand.source_blocks
        ]

        if len(matches) > 1:
            raise ValueError(
                f"Multiple candidates found for block {block.id} "
                f"with label '{label}'. Expected at most one."
            )

        return matches[0] if matches else None
673

674
    def get_best_candidate(self, block: Blocks) -> Candidate | None:
        """Return the highest-scoring constructed candidate for a block.

        When a block has candidates for multiple labels, the one with the
        highest score is the "winning" candidate for reporting and output
        purposes.

        Args:
            block: The block to get the best candidate for.

        Returns:
            The highest-scoring candidate whose ``constructed`` is not None,
            or None if the block has no such candidate. On a score tie the
            candidate encountered first wins.
        """
        constructed = (
            cand
            for cand in self.get_all_candidates_for_block(block)
            if cand.constructed is not None
        )
        return max(constructed, key=lambda cand: cand.score, default=None)
×
696

697
    # TODO I think this API is broken - there can be multiple labels per block,
698
    # but we only return one here.
699
    def get_label(self, block: Blocks) -> str | None:
        """Return the label of a block's highest-scoring constructed candidate.

        This is a convenience wrapper around get_best_candidate().

        Args:
            block: The block to get the label for.

        Returns:
            The label string of the best candidate, or None when
            get_best_candidate() finds no candidate for the block.
        """
        winner = self.get_best_candidate(block)
        if winner:
            return winner.label
        return None
1✔
719

720
    def add_candidate(self, candidate: Candidate) -> None:
        """Record a candidate under its own label.

        Each source block is validated first; the candidate is then appended
        to the list stored for ``candidate.label``, creating that list when
        the label has not been seen before.

        Args:
            candidate: The candidate to add.

        Raises:
            ValueError: If candidate has source_blocks that are not in PageData.
        """
        for source_block in candidate.source_blocks:
            self._validate_block_in_page_data(source_block, "candidate.source_blocks")

        self.candidates.setdefault(candidate.label, []).append(candidate)
1✔
738

739
    # TODO Reconsider the removal API below - do we need it? We have been
740
    # capturing all blocks by an element.
741
    def mark_removed(self, block: Blocks, reason: RemovalReason) -> None:
        """Record that ``block`` was removed, and why.

        Args:
            block: The block to mark as removed.
            reason: The reason for removal. It replaces any reason already
                stored for the same block ID.

        Raises:
            ValueError: If block is not in PageData.
        """
        self._validate_block_in_page_data(block, "block")
        self.removal_reasons.update({block.id: reason})
1✔
753

754
    def is_removed(self, block: Blocks) -> bool:
        """Report whether a block has been marked for removal.

        Args:
            block: The block to check.

        Returns:
            True if the block's ID has a recorded removal reason, False
            otherwise.
        """
        recorded = self.removal_reasons
        return block.id in recorded
1✔
764

765
    def count_unconsumed_blocks(self) -> int:
        """Count blocks that were neither removed nor consumed.

        A block counts as "unconsumed" when its ID is:
        - not a key of removal_reasons (it was not marked for removal), and
        - not in _consumed_blocks (it was not consumed during construction).

        This is useful for tracking classification completeness and
        identifying blocks that were not recognized.

        Returns:
            Number of distinct block IDs on the page that remain unconsumed.
        """
        removed_ids = self.removal_reasons
        consumed_ids = self._consumed_blocks
        page_block_ids = {b.id for b in self.page_data.blocks}
        return sum(
            1
            for block_id in page_block_ids
            if block_id not in removed_ids and block_id not in consumed_ids
        )
1✔
781

782
    def get_removal_reason(self, block: Blocks) -> RemovalReason | None:
        """Look up why a block was removed.

        Args:
            block: The block to get the removal reason for.

        Returns:
            The RemovalReason stored for the block's ID, or None when the
            block has not been marked as removed.
        """
        reasons = self.removal_reasons
        block_id = block.id
        return reasons[block_id] if block_id in reasons else None
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc