• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20361865516

19 Dec 2025 06:25AM UTC coverage: 89.13% (-0.002%) from 89.132%
20361865516

push

github

bramp
Fix lint errors: line length, unused imports, and YAML issues

- Add ruff isort configuration with known-first-party for build_a_long
- Add per-file E501 ignore for legocom_test.py (JSON test data)
- Create .yamllint config to relax strict YAML rules
- Fix E501 line length errors by wrapping long comments and strings
- Fix F841 unused variable errors
- Fix PLC0415 import-at-non-top-level errors
- Fix SIM108 ternary simplification errors

12 of 14 new or added lines in 8 files covered. (85.71%)

78 existing lines in 6 files now uncovered.

12915 of 14490 relevant lines covered (89.13%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

20.16
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import Candidate, classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classifier_config import ClassifierConfig
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
20
    Diagram,
21
    LegoPageElement,
22
    PartsList,
23
)
24
from build_a_long.pdf_extract.fixtures import (
1✔
25
    FIXTURES_DIR,
26
    RAW_FIXTURE_FILES,
27
    extract_element_id,
28
    load_classifier_config,
29
)
30

31
log = logging.getLogger(__name__)
1✔
32

33

34
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Read a raw-extraction fixture and return every page it contains.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
55

56

57
def _load_config_for_fixture(fixture_file: str) -> ClassifierConfig:
    """Build the classifier config (with hints) for a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        ClassifierConfig with font_size_hints and page_hints loaded from fixtures.
    """
    return load_classifier_config(extract_element_id(fixture_file))
68

69

70
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page, config)

            # Build a map of source_block -> label for successfully constructed
            # candidates. Keyed by id() because blocks are compared by identity.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if candidate.constructed is not None and candidate.source_blocks:
                        for block in candidate.source_blocks:
                            block_to_label[id(block)] = label

            # Find all elements that are both labeled and deleted
            labeled_and_deleted = [
                (elem, block_to_label[id(elem)])
                for elem in page.blocks
                if id(elem) in block_to_label and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                # Lazy %-args: message is only formatted if the record is emitted
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build a mapping from constructed element ID to candidate
            element_id_to_candidate: dict[int, Candidate] = {}
            for candidates in result.get_all_candidates().values():
                for candidate in candidates:
                    if candidate.constructed is None:
                        continue
                    elem_id = id(candidate.constructed)
                    src_id = (
                        id(candidate.source_blocks[0])
                        if candidate.source_blocks
                        else "None"
                    )
                    assert elem_id not in element_id_to_candidate, (
                        f"Source block id:{src_id} "
                        f"produced multiple elements of type "
                        f"{candidate.constructed.__class__.__name__} "
                        f"in {fixture_file} page {page_idx}"
                    )
                    element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                candidate = element_id_to_candidate.get(id(element))

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if candidate is None:
                    continue

                for source_block in candidate.source_blocks:
                    # FIX: the original guarded with
                    # `if source_block.id in blocks_to_element:` and then asserted
                    # `source_block.id not in blocks_to_element` inside that branch,
                    # so the assert condition was trivially False whenever reached.
                    # A single .get() expresses the uniqueness check directly.
                    existing_element = blocks_to_element.get(source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{source_block.id} "
                        f"({source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page "
                        f"{page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {source_block}"
                    )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            constructed_element_ids: set[int] = {
                id(candidate.constructed)
                for candidates in result.get_all_candidates().values()
                for candidate in candidates
                if candidate.constructed is not None
            }

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Elements that came from a candidate are fine
                if id(element) in constructed_element_ids:
                    continue

                # TODO Remove the following lines
                # Known synthetic/fallback elements that are expected:
                # - Empty PartsList when Step has no parts_list candidate
                # - Diagram when Step couldn't find a matching diagram candidate
                if isinstance(element, PartsList) and len(element.parts) == 0:
                    continue
                if isinstance(element, Diagram):
                    # Fallback diagrams are allowed when StepClassifier
                    # can't find a matching diagram candidate
                    continue

                orphan_elements.append((element, elem_type))

            if orphan_elements:
                log.error(
                    "Found %d orphan elements not from candidates in %s page %d:",
                    len(orphan_elements),
                    fixture_file,
                    page_idx,
                )
                for elem, elem_type in orphan_elements:
                    log.error("  - %s bbox:%s", elem_type, elem.bbox)

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)
        config = _load_config_for_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data, config)
            page = result.page

            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = {
                id(element) for element in page.iter_elements()
            }

            # Collect candidates marked constructed but absent from the tree
            orphaned_candidates: list[tuple[str, Candidate]] = [
                (label, candidate)
                for label, candidates in result.get_all_candidates().items()
                for candidate in candidates
                if candidate.constructed is not None
                and id(candidate.constructed) not in used_element_ids
            ]

            if orphaned_candidates:
                log.error(
                    "Found %d orphaned constructed candidates in %s page %d:",
                    len(orphaned_candidates),
                    fixture_file,
                    page_idx,
                )
                for label, candidate in orphaned_candidates:
                    log.error(
                        "  - %s: %s bbox:%s score:%.3f failure:%s",
                        label,
                        candidate.constructed.__class__.__name__,
                        candidate.bbox,
                        candidate.score,
                        candidate.failure_reason,
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc