• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19615175570

23 Nov 2025 06:03PM UTC coverage: 91.259% (-0.1%) from 91.376%
19615175570

push

github

bramp
fix: Minor improvement to the classifier_rules.

1 of 3 new or added lines in 1 file covered. (33.33%)

210 existing lines in 23 files now uncovered.

6160 of 6750 relevant lines covered (91.26%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.89
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classification_result import Candidate
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement
1✔
20
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
21

22
log = logging.getLogger(__name__)
1✔
23

24

25
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Read a raw-extraction fixture file and return every page it contains.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    pages = extraction.pages
    if not pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return pages
46

47

48
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Build a map of source_block -> label for successfully
            # constructed candidates. Keyed by id() because raw blocks are
            # compared by identity, not equality.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if (
                        candidate.constructed is not None
                        and candidate.source_block is not None
                    ):
                        block_to_label[id(candidate.source_block)] = label

            # Collect elements that are both labeled and deleted — the rule
            # under test says this set must be empty.
            labeled_and_deleted = []
            for elem in page.blocks:
                if id(elem) in block_to_label and result.is_removed(elem):
                    labeled_and_deleted.append((elem, block_to_label[id(elem)]))

            if labeled_and_deleted:
                # Log each offender before asserting so failures are easy to
                # diagnose from the test output. Lazy %-args avoid formatting
                # when the branch is (normally) never taken.
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Some pages legitimately produce no Page tree; nothing to check.
            if page is None:
                continue

            # Get all candidates from the classification result
            all_candidates = result.get_all_candidates()

            # Build a mapping from constructed element id() to its candidate.
            # A collision here means one constructed element was registered
            # by more than one candidate.
            element_id_to_candidate: dict[int, Candidate] = {}
            for candidates in all_candidates.values():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        elem_id = id(candidate.constructed)
                        assert elem_id not in element_id_to_candidate, (
                            f"Source block id:{id(candidate.source_block)} "
                            f"produced multiple elements of type "
                            f"{candidate.constructed.__class__.__name__} "
                            f"in {fixture_file} page {page_idx}"
                        )
                        element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                elem_id = id(element)

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if elem_id not in element_id_to_candidate:
                    continue

                candidate = element_id_to_candidate[elem_id]

                if candidate.source_block:
                    # A single .get() replaces the previous redundant
                    # `if key in dict: assert key not in dict` pattern: the
                    # assert holds exactly when the block is seen first here.
                    existing_element = blocks_to_element.get(candidate.source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{candidate.source_block.id} "
                        f"({candidate.source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page {page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {candidate.source_block}"
                    )
                    blocks_to_element[candidate.source_block.id] = element
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc