• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19556130039

21 Nov 2025 12:48AM UTC coverage: 90.819% (-0.05%) from 90.867%
19556130039

push

github

bramp
Updated the golden fixtures.

5025 of 5533 relevant lines covered (90.82%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.0
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import classify_elements
1✔
17
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
18
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
19

20
log = logging.getLogger(__name__)
1✔
21

22

23
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    The file is looked up under ``FIXTURES_DIR`` and parsed as a JSON-encoded
    ``ExtractionResult``.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    fixture_path = FIXTURES_DIR / fixture_file
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale default encoding.
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        fixture_path.read_text(encoding="utf-8")
    )  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
1✔
44

45

46
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.

        Args:
            fixture_file: Name of the raw fixture file to classify.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Map each source block to the label of a successfully constructed
            # candidate. Blocks are keyed by object identity (id()).
            # NOTE(review): the sibling test keys blocks by `.id` instead;
            # confirm whether the two should be unified.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if (
                        candidate.constructed is not None
                        and candidate.source_block is not None
                    ):
                        block_to_label[id(candidate.source_block)] = label

            # Collect every page block that is both labeled and removed.
            labeled_and_deleted = [
                (elem, block_to_label[id(elem)])
                for elem in page.blocks
                if id(elem) in block_to_label and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                # Log each offender before asserting so the failure output
                # identifies the individual elements.
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert not labeled_and_deleted, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.

        A candidate is treated as a winner here when its ``constructed`` value
        is not None; candidates without a source block are ignored.

        Args:
            fixture_file: Name of the raw fixture file to classify.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Track which blocks have successful constructions, and for which label
            block_to_winning_label: dict[int, str] = {}

            # Check all successful candidates across all labels
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    # Skip candidates that were not successfully constructed, and
                    # synthetic candidates (no source block).
                    if candidate.constructed is None or candidate.source_block is None:
                        continue

                    block_id = candidate.source_block.id

                    # A second successful construction for the same block is a
                    # violation; pytest.fail raises, ending the test here.
                    if block_id in block_to_winning_label:
                        existing_label = block_to_winning_label[block_id]
                        pytest.fail(
                            f"Block {block_id} in {fixture_file} page {page_idx} "
                            f"has multiple successful constructions: '{existing_label}' "
                            f"and '{label}'. "
                            f"Each block should have at most one successful construction."
                        )

                    block_to_winning_label[block_id] = label
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc