19521244091

Committed 15 Nov 2025 02:10AM UTC coverage: 90.833% (-0.4%) from 91.217%

Build # 19521244091

Build Type

push

github

Committed by

bramp

Commit Message

refactor: remove unused code and simplify domain invariant tests

Removed approximately 220 lines of unused/redundant code from classifier tests:

classifier_rules_test.py:
- Removed ClassifiedPage wrapper class (~110 lines) - never instantiated
- Removed helper functions (_parts_lists, _part_images, _part_counts,
  _print_label_counts) - never called
- Cleaned up unused imports (defaultdict, Block, ClassificationResult)
- Updated docstring to reflect remaining test coverage

domain_invariants_test.py:
- Simplified all 4 tests to use result.page property directly
- Replaced verbose 6-line get_candidates() pattern with simple property access
- Removed redundant isinstance(page, Page) assertions (~48 lines total)
- Tests now more clearly express intent: validate Page/PartsList/Part objects

All tests continue to pass. No functionality was lost.

Run Details

4 of 4 new or added lines in 2 files covered. (100.0%)

151 existing lines in 7 files now uncovered.

4994 of 5498 relevant lines covered (90.83%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.44

/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py

"""Rule-based tests over real fixtures for the PDF element classifier.

This suite validates high-level invariants that must hold after classification.

Rules covered:
- No labeled element should be marked as deleted.
- Each element has at most one winner candidate.

Real fixtures are the files listed in RAW_FIXTURE_FILES, loaded from FIXTURES_DIR
(both provided by build_a_long.pdf_extract.fixtures).
"""

import logging

import pytest

from build_a_long.pdf_extract.classifier import classify_elements
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES

# Module-level logger, named after this module. The tests use it to report the
# offending elements before the corresponding assertion fails.
log = logging.getLogger(__name__)


def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    The fixture is decoded explicitly as UTF-8 so that loading does not depend
    on the platform's default locale encoding.

    Args:
        fixture_file: Name of the fixture file, resolved relative to
            FIXTURES_DIR (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    fixture_path = FIXTURES_DIR / fixture_file
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        fixture_path.read_text(encoding="utf-8")
    )  # type: ignore[assignment]

    # An empty page list would make every parametrized test pass vacuously,
    # so treat it as a broken fixture instead.
    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages


class TestClassifierRules:
    """Invariants checked on real fixture pages after running the classifier."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """A block that received a label must not also be reported as removed.

        Every block of every page in the fixture is checked; any block that is
        both labeled and removed is logged and then fails the test.
        """
        fixture_pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(fixture_pages):
            # Classify the page end to end before inspecting its blocks.
            result = classify_elements(page_data)

            # Gather every block that carries a label yet is flagged as removed.
            labeled_and_deleted = [
                elem
                for elem in page_data.blocks
                if result.get_label(elem) is not None and result.is_removed(elem)
            ]

            # Log the details first so they are visible alongside the failure.
            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements "
                    f"that are deleted:"
                )
                for elem in labeled_and_deleted:
                    log.error(
                        f"  - {result.get_label(elem)} id:{elem.id} "
                        f"bbox:{elem.bbox} [DELETED]"
                    )

            assert not labeled_and_deleted, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """No source block may back more than one winning candidate.

        Candidates are examined across all labels. Non-winners and candidates
        without a source block are ignored; a second winner for the same
        block id fails the test.
        """
        fixture_pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(fixture_pages):
            # Classify the page end to end before inspecting its candidates.
            result = classify_elements(page_data)

            # Maps a source block id to the label whose candidate already won.
            winners_by_block: dict[int, str] = {}

            for label, label_candidates in result.get_all_candidates().items():
                for cand in label_candidates:
                    # Only winners backed by a real source block matter here;
                    # synthetic candidates (no source block) are skipped.
                    if not cand.is_winner or cand.source_block is None:
                        continue

                    block_id = cand.source_block.id

                    # A second winner for the same block is a violation.
                    if block_id in winners_by_block:
                        existing_label = winners_by_block[block_id]
                        pytest.fail(
                            f"Block {block_id} in {fixture_file} page {page_idx} "
                            f"has multiple winner candidates: '{existing_label}' "
                            f"and '{label}'. Each block should have at most one winner."
                        )

                    winners_by_block[block_id] = label

1	"""Rule-based tests over real fixtures for the PDF element classifier.
2
3	This suite validates high-level invariants that must hold after classification.
4
5	Rules covered:
6	- No labeled element should be marked as deleted.
7	- Each element has at most one winner candidate.
8
9	Real fixture(s) live under this package's fixtures/ directory.
10	"""
11
12	import logging	1✔
13
14	import pytest	1✔
15
16	from build_a_long.pdf_extract.classifier import classify_elements	1✔
17	from build_a_long.pdf_extract.extractor import ExtractionResult, PageData	1✔
18	from build_a_long.pdf_extract.fixtures import FIXTURES_DIR, RAW_FIXTURE_FILES	1✔
19
20	log = logging.getLogger(__name__)	1✔
21
22
23	def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:	1✔
24	"""Load all pages from a fixture file.
25
26	Args:
27	fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')
28
29	Returns:
30	All pages from the extraction result
31
32	Raises:
33	ValueError: If the fixture contains no pages
34	"""
35	fixture_path = FIXTURES_DIR / fixture_file	1✔
36	extraction: ExtractionResult = ExtractionResult.model_validate_json(	1✔
37	fixture_path.read_text()
38	) # type: ignore[assignment]
39
40	if not extraction.pages:	1✔
UNCOV 41	raise ValueError(f"No pages found in {fixture_file}")	×
42
43	return extraction.pages	1✔
44
45
46	class TestClassifierRules:	1✔
47	"""End-to-end rules that must hold on real pages after classification."""
48
49	@pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)	1✔
50	def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:	1✔
51	"""No element with a label should be marked as deleted.
52
53	If an element has been classified with a label, it should not be deleted.
54	This ensures that the classification and deletion logic don't conflict.
55	"""
56	pages = _load_pages_from_fixture(fixture_file)	1✔
57
58	for page_idx, page in enumerate(pages):	1✔
59	# Run the full classification pipeline on the page
60	result = classify_elements(page)	1✔
61
62	# Find all elements that are both labeled and deleted
63	labeled_and_deleted = []	1✔
64	for elem in page.blocks:	1✔
65	if result.get_label(elem) is not None and result.is_removed(elem):	1✔
UNCOV 66	labeled_and_deleted.append(elem)	×
67
68	if labeled_and_deleted:	1✔
UNCOV 69	log.error(	×
70	f"Found {len(labeled_and_deleted)} labeled elements "
71	f"that are deleted:"
72	)
UNCOV 73	for elem in labeled_and_deleted:	×
UNCOV 74	log.error(	×
75	f" - {result.get_label(elem)} id:{elem.id} "
76	f"bbox:{elem.bbox} [DELETED]"
77	)
78
79	assert len(labeled_and_deleted) == 0, (	1✔
80	f"Found {len(labeled_and_deleted)} labeled elements that are "
81	f"deleted in {fixture_file} page {page_idx}. "
82	f"Labeled elements should not be deleted."
83	)
84
85	@pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)	1✔
86	def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:	1✔
87	"""Each element should have at most one winner candidate across all labels.
88
89	An element can have multiple candidates across different labels, but only
90	one of them should be marked as a winner. This ensures classification
91	decisions are unambiguous.
92	"""
93	pages = _load_pages_from_fixture(fixture_file)	1✔
94
95	for page_idx, page in enumerate(pages):	1✔
96	# Run the full classification pipeline on the page
97	result = classify_elements(page)	1✔
98
99	# Track which blocks have won, and for which label
100	block_to_winning_label: dict[int, str] = {}	1✔
101
102	# Check all candidates across all labels
103	all_candidates = result.get_all_candidates()	1✔
104	for label, candidates in all_candidates.items():	1✔
105	for candidate in candidates:	1✔
106	if not candidate.is_winner:	1✔
107	continue	1✔
108
109	# Skip synthetic candidates (no source block)
110	if candidate.source_block is None:	1✔
111	continue	1✔
112
113	block_id = candidate.source_block.id	1✔
114
115	# Check if this block already has a winner
116	if block_id in block_to_winning_label:	1✔
UNCOV 117	existing_label = block_to_winning_label[block_id]	×
UNCOV 118	pytest.fail(	×
119	f"Block {block_id} in {fixture_file} page {page_idx} "
120	f"has multiple winner candidates: '{existing_label}' "
121	f"and '{label}'. Each block should have at most one winner."
122	)
123
124	block_to_winning_label[block_id] = label	1✔

bramp / build-along / 19521244091

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous