• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19615175570

23 Nov 2025 06:03PM UTC coverage: 91.259% (-0.1%) from 91.376%
19615175570

push

github

bramp
fix: Minor improvement to the classifier_rules.

1 of 3 new or added lines in 1 file covered. (33.33%)

210 existing lines in 23 files now uncovered.

6160 of 6750 relevant lines covered (91.26%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.89
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import classify_elements
1✔
17
from build_a_long.pdf_extract.classifier.classification_result import Candidate
1✔
18
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
19
from build_a_long.pdf_extract.extractor.lego_page_elements import LegoPageElement
1✔
20
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
21

22
log = logging.getLogger(__name__)
1✔
23

24

25
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Read a raw-extraction fixture file and return every page it contains.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    pages = extraction.pages
    if not pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return pages
46

47

48
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Build a map of source_block -> label for successfully
            # constructed candidates. Keyed by id() because raw blocks are
            # compared by identity, not equality.
            block_to_label: dict[int, str] = {}
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    if (
                        candidate.constructed is not None
                        and candidate.source_block is not None
                    ):
                        block_to_label[id(candidate.source_block)] = label

            # Collect elements that are both labeled and deleted — the rule
            # under test says this set must be empty.
            labeled_and_deleted = []
            for elem in page.blocks:
                if id(elem) in block_to_label and result.is_removed(elem):
                    labeled_and_deleted.append((elem, block_to_label[id(elem)]))

            if labeled_and_deleted:
                # Log each offender before asserting so failures are easy to
                # diagnose from the test output. Lazy %-args avoid formatting
                # when the branch is (normally) never taken.
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            # Some pages legitimately produce no Page tree; nothing to check.
            if page is None:
                continue

            # Get all candidates from the classification result
            all_candidates = result.get_all_candidates()

            # Build a mapping from constructed element id() to its candidate.
            # A collision here means one constructed element was registered
            # by more than one candidate.
            element_id_to_candidate: dict[int, Candidate] = {}
            for candidates in all_candidates.values():
                for candidate in candidates:
                    if candidate.constructed is not None:
                        elem_id = id(candidate.constructed)
                        assert elem_id not in element_id_to_candidate, (
                            f"Source block id:{id(candidate.source_block)} "
                            f"produced multiple elements of type "
                            f"{candidate.constructed.__class__.__name__} "
                            f"in {fixture_file} page {page_idx}"
                        )
                        element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                elem_id = id(element)

                # Skip synthetic/fallback elements that weren't created by candidates
                # (e.g., empty PartsLists created when Step has no parts_list)
                if elem_id not in element_id_to_candidate:
                    continue

                candidate = element_id_to_candidate[elem_id]

                if candidate.source_block:
                    # A single .get() replaces the previous redundant
                    # `if key in dict: assert key not in dict` pattern: the
                    # assert holds exactly when the block is seen first here.
                    existing_element = blocks_to_element.get(candidate.source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{candidate.source_block.id} "
                        f"({candidate.source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page {page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {candidate.source_block}"
                    )
                    blocks_to_element[candidate.source_block.id] = element
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc