• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20361865516

19 Dec 2025 06:25AM UTC coverage: 89.13% (-0.002%) from 89.132%
20361865516

push

github

bramp
Fix lint errors: line length, unused imports, and YAML issues

- Add ruff isort configuration with known-first-party for build_a_long
- Add per-file E501 ignore for legocom_test.py (JSON test data)
- Create .yamllint config to relax strict YAML rules
- Fix E501 line length errors by wrapping long comments and strings
- Fix F841 unused variable errors
- Fix PLC0415 import-at-non-top-level errors
- Fix SIM108 ternary simplification errors

12 of 14 new or added lines in 8 files covered. (85.71%)

78 existing lines in 6 files now uncovered.

12915 of 14490 relevant lines covered (89.13%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

20.16
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import Candidate, classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
20
    Diagram,
21
    LegoPageElement,
22
    PartsList,
23
)
24
from build_a_long.pdf_extract.fixtures import (
1✔
25
    FIXTURES_DIR,
26
    RAW_FIXTURE_FILES,
27
    extract_element_id,
28
    load_classifier_config,
29
)
30

31
log = logging.getLogger(__name__)
1✔
32

33

34
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Read a raw-extraction fixture and return every page it contains.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
55

56

57
def _load_config_for_fixture(fixture_file: str) -> ClassifierConfig:
    """Build the classifier config (with hints) for a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        ClassifierConfig with font_size_hints and page_hints loaded from fixtures.
    """
    return load_classifier_config(extract_element_id(fixture_file))
68

69

70
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page, config)

            # Build a map of source_block -> label for successfully constructed
            # candidates. Keyed by id() because blocks are compared by identity.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if candidate.constructed is not None and candidate.source_blocks:
                        for block in candidate.source_blocks:
                            block_to_label[id(block)] = label

            # Find all elements that are both labeled and deleted
            labeled_and_deleted = [
                (elem, block_to_label[id(elem)])
                for elem in page.blocks
                if id(elem) in block_to_label and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                # Lazy %-args: message is only formatted if the record is emitted
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build a mapping from constructed element ID to candidate
            element_id_to_candidate: dict[int, Candidate] = {}
            for candidates in result.get_all_candidates().values():
                for candidate in candidates:
                    if candidate.constructed is None:
                        continue
                    elem_id = id(candidate.constructed)
                    src_id = (
                        id(candidate.source_blocks[0])
                        if candidate.source_blocks
                        else "None"
                    )
                    assert elem_id not in element_id_to_candidate, (
                        f"Source block id:{src_id} "
                        f"produced multiple elements of type "
                        f"{candidate.constructed.__class__.__name__} "
                        f"in {fixture_file} page {page_idx}"
                    )
                    element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                candidate = element_id_to_candidate.get(id(element))

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if candidate is None:
                    continue

                for source_block in candidate.source_blocks:
                    # FIX: the original guarded with
                    # `if source_block.id in blocks_to_element:` and then asserted
                    # `source_block.id not in blocks_to_element` inside that branch,
                    # so the assert condition was trivially False whenever reached.
                    # A single .get() expresses the uniqueness check directly.
                    existing_element = blocks_to_element.get(source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{source_block.id} "
                        f"({source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page "
                        f"{page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {source_block}"
                    )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            constructed_element_ids: set[int] = {
                id(candidate.constructed)
                for candidates in result.get_all_candidates().values()
                for candidate in candidates
                if candidate.constructed is not None
            }

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Elements that came from a candidate are fine
                if id(element) in constructed_element_ids:
                    continue

                # TODO Remove the following lines
                # Known synthetic/fallback elements that are expected:
                # - Empty PartsList when Step has no parts_list candidate
                # - Diagram when Step couldn't find a matching diagram candidate
                if isinstance(element, PartsList) and len(element.parts) == 0:
                    continue
                if isinstance(element, Diagram):
                    # Fallback diagrams are allowed when StepClassifier
                    # can't find a matching diagram candidate
                    continue

                orphan_elements.append((element, elem_type))

            if orphan_elements:
                log.error(
                    "Found %d orphan elements not from candidates in %s page %d:",
                    len(orphan_elements),
                    fixture_file,
                    page_idx,
                )
                for elem, elem_type in orphan_elements:
                    log.error("  - %s bbox:%s", elem_type, elem.bbox)

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = {
                id(element) for element in page.iter_elements()
            }

            # Collect candidates marked constructed but absent from the tree
            orphaned_candidates: list[tuple[str, Candidate]] = [
                (label, candidate)
                for label, candidates in result.get_all_candidates().items()
                for candidate in candidates
                if candidate.constructed is not None
                and id(candidate.constructed) not in used_element_ids
            ]

            if orphaned_candidates:
                log.error(
                    "Found %d orphaned constructed candidates in %s page %d:",
                    len(orphaned_candidates),
                    fixture_file,
                    page_idx,
                )
                for label, candidate in orphaned_candidates:
                    log.error(
                        "  - %s: %s bbox:%s score:%.3f failure:%s",
                        label,
                        candidate.constructed.__class__.__name__,
                        candidate.bbox,
                        candidate.score,
                        candidate.failure_reason,
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc