• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19727090583

27 Nov 2025 06:15AM UTC coverage: 89.781% (+0.8%) from 88.977%
19727090583

push

github

bramp
Multiple improvements to classifiers, specifically around documentation, removing unused fields, and improving type hinting.

26 of 26 new or added lines in 14 files covered. (100.0%)

94 existing lines in 17 files now uncovered.

7327 of 8161 relevant lines covered (89.78%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.9
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classification_result import Candidate
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
20
    Diagram,
21
    LegoPageElement,
22
    PartsList,
23
)
24
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
25

26
log = logging.getLogger(__name__)
1✔
27

28

29
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    # Guard against empty or malformed fixtures up front so tests fail loudly.
    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
50

51

52
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Find all elements that are both labeled and deleted
            # Build a map of source_block -> label for successfully constructed
            # candidates. Keys are id(block): keyed by object identity, so a
            # block backing several candidates keeps the last label seen.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if candidate.constructed is not None and candidate.source_blocks:
                        for block in candidate.source_blocks:
                            block_to_label[id(block)] = label

            # Collect every raw block that is both labeled and removed —
            # the rule under test says this set must be empty.
            labeled_and_deleted = []
            for elem in page.blocks:
                if id(elem) in block_to_label and result.is_removed(elem):
                    labeled_and_deleted.append((elem, block_to_label[id(elem)]))

            # Log each offender before asserting so failures are debuggable
            # from the test output alone.
            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements "
                    f"that are deleted:"
                )
                for elem, label in labeled_and_deleted:
                    log.error(f"  - {label} id:{elem.id} bbox:{elem.bbox} [DELETED]")

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        # TODO: Remove this skip once the "winning" concept is implemented
        # These fixtures have Parts that appear in multiple PartsLists due to
        # overlapping Drawing bboxes. The winning concept will prevent duplicate
        # Part usage across candidates.
        if fixture_file in ["6509377_page_014_raw.json", "6509377_page_015_raw.json"]:
            pytest.skip(
                "Skipping until 'winning' concept prevents duplicate Part usage "
                "across multiple PartsList candidates"
            )

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Pages that produced no classified tree have nothing to check.
            if page is None:
                continue

            # Get all candidates from the classification result
            all_candidates = result.get_all_candidates()

            # Build a mapping from constructed element ID to candidate.
            # id() keys rely on the constructed objects staying alive for the
            # duration of this loop (they do: `result` holds them).
            element_id_to_candidate: dict[int, Candidate] = {}
            for _label, candidates in all_candidates.items():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        elem_id = id(candidate.constructed)
                        # Two candidates must never share one constructed object.
                        assert elem_id not in element_id_to_candidate, (
                            f"Source block id:"
                            f"{id(candidate.source_blocks[0]) if candidate.source_blocks else 'None'} "
                            f"produced multiple elements of type "
                            f"{candidate.constructed.__class__.__name__} "
                            f"in {fixture_file} page {page_idx}"
                        )
                        element_id_to_candidate[elem_id] = candidate

            # Maps source_block.id -> the single element it contributed to.
            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                elem_id = id(element)

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if elem_id not in element_id_to_candidate:
                    continue

                candidate = element_id_to_candidate[elem_id]

                for source_block in candidate.source_blocks:
                    if source_block.id in blocks_to_element:
                        existing_element = blocks_to_element[source_block.id]
                        # This assert is always false when reached; the guard
                        # above exists only so the failure message can include
                        # the previously-mapped element alongside the new one.
                        assert source_block.id not in blocks_to_element, (
                            f"Source block id:{source_block.id} "
                            f"({source_block.tag}) mapped to multiple "
                            f"elements in {fixture_file} page "
                            f"{page_data.page_number}:\n"
                            f"  First:  {existing_element}\n"
                            f"  Second: {element}\n"
                            f"  Source: {source_block}"
                        )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Pages that produced no classified tree have nothing to check.
            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            all_candidates = result.get_all_candidates()
            constructed_element_ids: set[int] = set()
            for _label, candidates in all_candidates.items():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        constructed_element_ids.add(id(candidate.constructed))

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_id = id(element)
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Check if this element came from a candidate
                if elem_id not in constructed_element_ids:
                    # TODO Remove the following lines
                    # Known synthetic/fallback elements that are expected:
                    # - Empty PartsList when Step has no parts_list candidate
                    # - Diagram when Step couldn't find a matching diagram candidate
                    if isinstance(element, PartsList) and len(element.parts) == 0:
                        continue
                    if isinstance(element, Diagram):
                        # Fallback diagrams are allowed when StepClassifier
                        # can't find a matching diagram candidate
                        continue

                    orphan_elements.append((element, elem_type))

            # Log each orphan before asserting so failures are debuggable
            # from the test output alone.
            if orphan_elements:
                log.error(
                    f"Found {len(orphan_elements)} orphan elements not from "
                    f"candidates in {fixture_file} page {page_idx}:"
                )
                for elem, elem_type in orphan_elements:
                    log.error(f"  - {elem_type} bbox:{elem.bbox}")

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Pages that produced no classified tree have nothing to check.
            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = set()
            for element in page.iter_elements():
                used_element_ids.add(id(element))

            # Check all candidates for orphaned constructed elements
            all_candidates = result.get_all_candidates()
            orphaned_candidates: list[tuple[str, Candidate]] = []

            for label, candidates in all_candidates.items():
                for candidate in candidates:
                    # If candidate is marked as constructed but not in the tree
                    if (
                        candidate.constructed is not None
                        and id(candidate.constructed) not in used_element_ids
                    ):
                        orphaned_candidates.append((label, candidate))

            # Log each orphaned candidate before asserting so failures are
            # debuggable from the test output alone.
            if orphaned_candidates:
                log.error(
                    f"Found {len(orphaned_candidates)} orphaned constructed "
                    f"candidates in {fixture_file} page {page_idx}:"
                )
                for label, candidate in orphaned_candidates:
                    elem_type = candidate.constructed.__class__.__name__
                    log.error(
                        f"  - {label}: {elem_type} bbox:{candidate.bbox} "
                        f"score:{candidate.score:.3f} "
                        f"failure:{candidate.failure_reason}"
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc