• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19727090583

27 Nov 2025 06:15AM UTC coverage: 89.781% (+0.8%) from 88.977%
19727090583

push

github

bramp
Multiple improvements to classifiers, specifically around documentation, removing unused fields, and improving type hinting.

26 of 26 new or added lines in 14 files covered. (100.0%)

94 existing lines in 17 files now uncovered.

7327 of 8161 relevant lines covered (89.78%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.9
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classification_result import Candidate
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
20
    Diagram,
21
    LegoPageElement,
22
    PartsList,
23
)
24
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
25

26
log = logging.getLogger(__name__)
1✔
27

28

29
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    # Guard against empty or malformed fixtures up front so tests fail loudly.
    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
50

51

52
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Find all elements that are both labeled and deleted
            # Build a map of source_block -> label for successfully constructed
            # candidates. Keys are id(block): keyed by object identity, so a
            # block backing several candidates keeps the last label seen.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if candidate.constructed is not None and candidate.source_blocks:
                        for block in candidate.source_blocks:
                            block_to_label[id(block)] = label

            # Collect every raw block that is both labeled and removed —
            # the rule under test says this set must be empty.
            labeled_and_deleted = []
            for elem in page.blocks:
                if id(elem) in block_to_label and result.is_removed(elem):
                    labeled_and_deleted.append((elem, block_to_label[id(elem)]))

            # Log each offender before asserting so failures are debuggable
            # from the test output alone.
            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements "
                    f"that are deleted:"
                )
                for elem, label in labeled_and_deleted:
                    log.error(f"  - {label} id:{elem.id} bbox:{elem.bbox} [DELETED]")

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        # TODO: Remove this skip once the "winning" concept is implemented
        # These fixtures have Parts that appear in multiple PartsLists due to
        # overlapping Drawing bboxes. The winning concept will prevent duplicate
        # Part usage across candidates.
        if fixture_file in ["6509377_page_014_raw.json", "6509377_page_015_raw.json"]:
            pytest.skip(
                "Skipping until 'winning' concept prevents duplicate Part usage "
                "across multiple PartsList candidates"
            )

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Pages that produced no classified tree have nothing to check.
            if page is None:
                continue

            # Get all candidates from the classification result
            all_candidates = result.get_all_candidates()

            # Build a mapping from constructed element ID to candidate.
            # id() keys rely on the constructed objects staying alive for the
            # duration of this loop (they do: `result` holds them).
            element_id_to_candidate: dict[int, Candidate] = {}
            for _label, candidates in all_candidates.items():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        elem_id = id(candidate.constructed)
                        # Two candidates must never share one constructed object.
                        assert elem_id not in element_id_to_candidate, (
                            f"Source block id:"
                            f"{id(candidate.source_blocks[0]) if candidate.source_blocks else 'None'} "
                            f"produced multiple elements of type "
                            f"{candidate.constructed.__class__.__name__} "
                            f"in {fixture_file} page {page_idx}"
                        )
                        element_id_to_candidate[elem_id] = candidate

            # Maps source_block.id -> the single element it contributed to.
            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                elem_id = id(element)

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if elem_id not in element_id_to_candidate:
                    continue

                candidate = element_id_to_candidate[elem_id]

                for source_block in candidate.source_blocks:
                    if source_block.id in blocks_to_element:
                        existing_element = blocks_to_element[source_block.id]
                        # This assert is always false when reached; the guard
                        # above exists only so the failure message can include
                        # the previously-mapped element alongside the new one.
                        assert source_block.id not in blocks_to_element, (
                            f"Source block id:{source_block.id} "
                            f"({source_block.tag}) mapped to multiple "
                            f"elements in {fixture_file} page "
                            f"{page_data.page_number}:\n"
                            f"  First:  {existing_element}\n"
                            f"  Second: {element}\n"
                            f"  Source: {source_block}"
                        )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Pages that produced no classified tree have nothing to check.
            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            all_candidates = result.get_all_candidates()
            constructed_element_ids: set[int] = set()
            for _label, candidates in all_candidates.items():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        constructed_element_ids.add(id(candidate.constructed))

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_id = id(element)
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Check if this element came from a candidate
                if elem_id not in constructed_element_ids:
                    # TODO Remove the following lines
                    # Known synthetic/fallback elements that are expected:
                    # - Empty PartsList when Step has no parts_list candidate
                    # - Diagram when Step couldn't find a matching diagram candidate
                    if isinstance(element, PartsList) and len(element.parts) == 0:
                        continue
                    if isinstance(element, Diagram):
                        # Fallback diagrams are allowed when StepClassifier
                        # can't find a matching diagram candidate
                        continue

                    orphan_elements.append((element, elem_type))

            # Log each orphan before asserting so failures are debuggable
            # from the test output alone.
            if orphan_elements:
                log.error(
                    f"Found {len(orphan_elements)} orphan elements not from "
                    f"candidates in {fixture_file} page {page_idx}:"
                )
                for elem, elem_type in orphan_elements:
                    log.error(f"  - {elem_type} bbox:{elem.bbox}")

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Pages that produced no classified tree have nothing to check.
            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = set()
            for element in page.iter_elements():
                used_element_ids.add(id(element))

            # Check all candidates for orphaned constructed elements
            all_candidates = result.get_all_candidates()
            orphaned_candidates: list[tuple[str, Candidate]] = []

            for label, candidates in all_candidates.items():
                for candidate in candidates:
                    # If candidate is marked as constructed but not in the tree
                    if (
                        candidate.constructed is not None
                        and id(candidate.constructed) not in used_element_ids
                    ):
                        orphaned_candidates.append((label, candidate))

            # Log each orphaned candidate before asserting so failures are
            # debuggable from the test output alone.
            if orphaned_candidates:
                log.error(
                    f"Found {len(orphaned_candidates)} orphaned constructed "
                    f"candidates in {fixture_file} page {page_idx}:"
                )
                for label, candidate in orphaned_candidates:
                    elem_type = candidate.constructed.__class__.__name__
                    log.error(
                        f"  - {label}: {elem_type} bbox:{candidate.bbox} "
                        f"score:{candidate.score:.3f} "
                        f"failure:{candidate.failure_reason}"
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc