• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19796282858

30 Nov 2025 08:32AM UTC coverage: 90.603% (-0.04%) from 90.646%
19796282858

push

github

bramp
test: update golden files for classifier with hints

Regenerated golden files using classifier config with font_hints and
page_hints for improved classification accuracy.

9835 of 10855 relevant lines covered (90.6%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

20.33
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import Candidate, classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
20
    Diagram,
21
    LegoPageElement,
22
    PartsList,
23
)
24
from build_a_long.pdf_extract.fixtures import (
1✔
25
    FIXTURES_DIR,
26
    RAW_FIXTURE_FILES,
27
    extract_element_id,
28
    load_classifier_config,
29
)
30

31
log = logging.getLogger(__name__)
1✔
32

33

34
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    pages = extraction.pages
    if not pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return pages
55

56

57
def _load_config_for_fixture(fixture_file: str) -> ClassifierConfig:
    """Load classifier config with hints for a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        ClassifierConfig with font_size_hints and page_hints loaded from fixtures.
    """
    # Derive the element id (e.g. '6509377') from the fixture filename, then
    # load the matching hint-augmented config.
    return load_classifier_config(extract_element_id(fixture_file))
68

69

70
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page, config)

            # Find all elements that are both labeled and deleted.
            # Build a map of source_block -> label for successfully constructed
            # candidates.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if candidate.constructed is not None and candidate.source_blocks:
                        for block in candidate.source_blocks:
                            block_to_label[id(block)] = label

            labeled_and_deleted = []
            for elem in page.blocks:
                if id(elem) in block_to_label and result.is_removed(elem):
                    labeled_and_deleted.append((elem, block_to_label[id(elem)]))

            if labeled_and_deleted:
                # Lazy %-style args: formatting only happens if the record is
                # actually emitted.
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Get all candidates from the classification result
            all_candidates = result.get_all_candidates()

            # Build a mapping from constructed element ID to candidate
            element_id_to_candidate: dict[int, Candidate] = {}
            for _label, candidates in all_candidates.items():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        elem_id = id(candidate.constructed)
                        assert elem_id not in element_id_to_candidate, (
                            f"Source block id:"
                            f"{id(candidate.source_blocks[0]) if candidate.source_blocks else 'None'} "
                            f"produced multiple elements of type "
                            f"{candidate.constructed.__class__.__name__} "
                            f"in {fixture_file} page {page_idx}"
                        )
                        element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                elem_id = id(element)

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if elem_id not in element_id_to_candidate:
                    continue

                candidate = element_id_to_candidate[elem_id]

                for source_block in candidate.source_blocks:
                    # Single lookup: assert directly on the prior mapping rather
                    # than guarding with `in` and re-asserting the negation
                    # (the old form only reached the assert when it was
                    # guaranteed to fail, and did a second dict lookup).
                    existing_element = blocks_to_element.get(source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{source_block.id} "
                        f"({source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page "
                        f"{page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {source_block}"
                    )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            all_candidates = result.get_all_candidates()
            constructed_element_ids: set[int] = set()
            for _label, candidates in all_candidates.items():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        constructed_element_ids.add(id(candidate.constructed))

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_id = id(element)
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Check if this element came from a candidate
                if elem_id not in constructed_element_ids:
                    # TODO Remove the following lines
                    # Known synthetic/fallback elements that are expected:
                    # - Empty PartsList when Step has no parts_list candidate
                    # - Diagram when Step couldn't find a matching diagram candidate
                    if isinstance(element, PartsList) and len(element.parts) == 0:
                        continue
                    if isinstance(element, Diagram):
                        # Fallback diagrams are allowed when StepClassifier
                        # can't find a matching diagram candidate
                        continue

                    orphan_elements.append((element, elem_type))

            if orphan_elements:
                log.error(
                    "Found %d orphan elements not from candidates in %s page %d:",
                    len(orphan_elements),
                    fixture_file,
                    page_idx,
                )
                for elem, elem_type in orphan_elements:
                    log.error("  - %s bbox:%s", elem_type, elem.bbox)

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = set()
            for element in page.iter_elements():
                used_element_ids.add(id(element))

            # Check all candidates for orphaned constructed elements
            all_candidates = result.get_all_candidates()
            orphaned_candidates: list[tuple[str, Candidate]] = []

            for label, candidates in all_candidates.items():
                for candidate in candidates:
                    # If candidate is marked as constructed but not in the tree
                    if (
                        candidate.constructed is not None
                        and id(candidate.constructed) not in used_element_ids
                    ):
                        orphaned_candidates.append((label, candidate))

            if orphaned_candidates:
                log.error(
                    "Found %d orphaned constructed candidates in %s page %d:",
                    len(orphaned_candidates),
                    fixture_file,
                    page_idx,
                )
                for label, candidate in orphaned_candidates:
                    elem_type = candidate.constructed.__class__.__name__
                    log.error(
                        "  - %s: %s bbox:%s score:%.3f failure:%s",
                        label,
                        elem_type,
                        candidate.bbox,
                        candidate.score,
                        candidate.failure_reason,
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc