• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19751674072

28 Nov 2025 01:40AM UTC coverage: 89.023% (-0.8%) from 89.847%
19751674072

push

github

bramp
refactor(classifier): reorganize text modules and break circular dependencies

Major Changes:
- Created text/ subdirectory for text-related classifier modules
- Moved text_histogram, text_extractors, font_size_hints to text/
- Created constants.py to resolve circular dependency issue

Module Organization:
- classifier/text/__init__.py: Package exports for text modules
- classifier/text/text_histogram.py: TextHistogram class
- classifier/text/text_extractors.py: Text extraction functions
- classifier/text/font_size_hints.py: FontSizeHints class

Circular Dependency Resolution:
- Created classifier/constants.py with CATALOG_ELEMENT_ID_THRESHOLD
- Removed ClassVar from PageHintCollection
- Updated font_size_hints.py and page_hint_collection.py to import from constants
- Fixed package-level circular import by importing TextHistogram directly from module
- Added TODO to consider moving constant to ClassifierConfig

Bug Fixes:
- Fixed DrawableItem frozen model issue in drawing.py
- Create new instances with depth instead of mutating frozen objects

Import Updates:
- Updated all imports across ~15 files to use new module paths
- Updated classifier/__init__.py to re-export text module classes

Tests:
- All tests passing (42/42 test files)
- Type checking passes
- Code formatted with ruff

32 of 32 new or added lines in 23 files covered. (100.0%)

180 existing lines in 19 files now uncovered.

7429 of 8345 relevant lines covered (89.02%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

18.26
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- No labeled element should be marked as deleted.
7
- Each element has at most one winner candidate.
8

9
Real fixture(s) live under this package's fixtures/ directory.
10
"""
11

12
import logging
1✔
13

14
import pytest
1✔
15

16
from build_a_long.pdf_extract.classifier import Candidate, classify_elements
1✔
17
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
18
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
19
    Diagram,
20
    LegoPageElement,
21
    PartsList,
22
)
23
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES
1✔
24

25
log = logging.getLogger(__name__)
1✔
26

27

28
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Read a raw extraction fixture and return every page it contains.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    raw_json = (FIXTURES_DIR / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(raw_json)  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages
50

51
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @staticmethod
    def _constructed_candidates(result) -> "list[tuple[str, Candidate]]":
        """Return (label, candidate) pairs for candidates that built an element.

        Shared by the tests below, which all need to walk every candidate that
        has a non-None ``constructed`` element.
        """
        return [
            (label, candidate)
            for label, candidates in result.get_all_candidates().items()
            for candidate in candidates
            if candidate.constructed is not None
        ]

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages: list[PageData] = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Build a map of source_block -> label for successfully constructed
            # candidates. Keyed by id() because raw blocks are compared by
            # identity, not content.
            block_to_label: dict[int, str] = {}
            for label, candidate in self._constructed_candidates(result):
                for block in candidate.source_blocks:
                    block_to_label[id(block)] = label

            # Find all elements that are both labeled and deleted
            labeled_and_deleted = [
                (elem, block_to_label[id(elem)])
                for elem in page.blocks
                if id(elem) in block_to_label and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                # Lazy %-style args so formatting only happens when emitted.
                log.error(
                    "Found %d labeled elements that are deleted:",
                    len(labeled_and_deleted),
                )
                for elem, label in labeled_and_deleted:
                    log.error(
                        "  - %s id:%s bbox:%s [DELETED]", label, elem.id, elem.bbox
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_source_block_maps_to_one_element(self, fixture_file: str) -> None:
        """Each source block should map to at most one LegoPageElement.

        This validates that the classification pipeline doesn't create duplicate
        elements from the same source block. Each raw extraction block should
        produce at most one classified element in the final Page tree.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            if page is None:
                continue

            # Build a mapping from constructed element ID to candidate; each
            # constructed element must come from exactly one candidate.
            element_id_to_candidate: dict[int, Candidate] = {}
            for _label, candidate in self._constructed_candidates(result):
                elem_id = id(candidate.constructed)
                assert elem_id not in element_id_to_candidate, (
                    f"Source block id:"
                    f"{id(candidate.source_blocks[0]) if candidate.source_blocks else 'None'} "
                    f"produced multiple elements of type "
                    f"{candidate.constructed.__class__.__name__} "
                    f"in {fixture_file} page {page_idx}"
                )
                element_id_to_candidate[elem_id] = candidate

            blocks_to_element: dict[int, LegoPageElement] = {}

            # Traverse all LegoPageElements in the Page tree
            for element in page.iter_elements():
                # Skip synthetic/fallback elements that weren't created by
                # candidates (e.g., empty PartsLists created when Step has no
                # parts_list)
                candidate = element_id_to_candidate.get(id(element))
                if candidate is None:
                    continue

                for source_block in candidate.source_blocks:
                    # Bug fix: the original asserted a condition that was
                    # already known False inside an `if` guard. Assert the
                    # invariant directly instead.
                    existing_element = blocks_to_element.get(source_block.id)
                    assert existing_element is None, (
                        f"Source block id:{source_block.id} "
                        f"({source_block.tag}) mapped to multiple "
                        f"elements in {fixture_file} page "
                        f"{page_data.page_number}:\n"
                        f"  First:  {existing_element}\n"
                        f"  Second: {element}\n"
                        f"  Source: {source_block}"
                    )
                    blocks_to_element[source_block.id] = element

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_lego_elements_come_from_candidates(self, fixture_file: str) -> None:
        """All LegoPageElements in the final Page tree must come from candidates.

        This validates that classifiers don't create "orphan" elements directly
        without a corresponding candidate. Every LegoPageElement should be either:
        1. The constructed element of a candidate, or
        2. A synthetic/fallback element (e.g., empty PartsList when Step has no
           parts_list candidate)

        Ensures proper tracking of all elements through the classification pipeline.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            if page is None:
                continue

            # Build a set of all constructed element IDs from candidates
            constructed_element_ids: set[int] = {
                id(candidate.constructed)
                for _label, candidate in self._constructed_candidates(result)
            }

            # Traverse all LegoPageElements in the Page tree
            orphan_elements: list[tuple[LegoPageElement, str]] = []
            for element in page.iter_elements():
                elem_type = element.__class__.__name__

                # Skip the Page itself (it's the root container)
                if elem_type == "Page":
                    continue

                # Check if this element came from a candidate
                if id(element) in constructed_element_ids:
                    continue

                # TODO Remove the following lines
                # Known synthetic/fallback elements that are expected:
                # - Empty PartsList when Step has no parts_list candidate
                # - Diagram when Step couldn't find a matching diagram candidate
                if isinstance(element, PartsList) and len(element.parts) == 0:
                    continue
                if isinstance(element, Diagram):
                    # Fallback diagrams are allowed when StepClassifier
                    # can't find a matching diagram candidate
                    continue

                orphan_elements.append((element, elem_type))

            if orphan_elements:
                log.error(
                    "Found %d orphan elements not from candidates in %s page %d:",
                    len(orphan_elements),
                    fixture_file,
                    page_idx,
                )
                for elem, elem_type in orphan_elements:
                    log.error("  - %s bbox:%s", elem_type, elem.bbox)

            assert len(orphan_elements) == 0, (
                f"Found {len(orphan_elements)} orphan LegoPageElements not from "
                f"candidates in {fixture_file} page {page_idx}. "
                f"All elements should come from candidates or be known fallbacks."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_orphaned_constructed_candidates(self, fixture_file: str) -> None:
        """No candidate marked constructed without being in the final tree.

        This validates the transactional rollback semantics of build():
        - If a parent classifier's build() fails, all sub-candidates it built
          should be rolled back (constructed = None)
        - Only candidates that are actually used in the final Page tree should
          remain marked as constructed

        This catches bugs where:
        1. A classifier builds sub-candidates (e.g., step builds step_number)
        2. The classifier then fails (e.g., parts_list build fails)
        3. The step_number candidate remains orphaned with constructed set,
           but not actually used in the final tree
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page_data)
            page = result.page

            if page is None:
                continue

            # Build set of all element IDs actually used in the final Page tree
            used_element_ids: set[int] = {
                id(element) for element in page.iter_elements()
            }

            # Candidates marked constructed whose element never made it into
            # the tree indicate a failed rollback.
            orphaned_candidates: list[tuple[str, Candidate]] = [
                (label, candidate)
                for label, candidate in self._constructed_candidates(result)
                if id(candidate.constructed) not in used_element_ids
            ]

            if orphaned_candidates:
                log.error(
                    "Found %d orphaned constructed candidates in %s page %d:",
                    len(orphaned_candidates),
                    fixture_file,
                    page_idx,
                )
                for label, candidate in orphaned_candidates:
                    log.error(
                        "  - %s: %s bbox:%s score:%.3f failure:%s",
                        label,
                        candidate.constructed.__class__.__name__,
                        candidate.bbox,
                        candidate.score,
                        candidate.failure_reason,
                    )

            assert len(orphaned_candidates) == 0, (
                f"Found {len(orphaned_candidates)} orphaned constructed candidates "
                f"in {fixture_file} page {page_idx}. "
                f"Candidates marked as constructed should either be in the final "
                f"Page tree or rolled back to constructed=None. "
                f"This indicates a transactional rollback failure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc