
bramp / build-along · build 19251794703 (push, via GitHub)

11 Nov 2025 01:25AM UTC · coverage: 90.748% (+3.9%) from 86.822%

Commit by bramp:
Update golden files to reflect improved parts list classification

The parts_list_max_area_ratio filter now correctly rejects full-page
drawings (bbox: 0,0 to 552.76,496.06) that were previously incorrectly
classified as parts lists.

Updated golden files:
- 6509377_page_015_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox
- 6509377_page_180_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox

These changes reflect the correct behavior where drawings occupying
>75% of the page area are rejected as likely background elements.

4708 of 5188 relevant lines covered (90.75%) · 0.91 hits per line

Source File

/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py (81.48% of lines covered)
"""Rule-based tests over real fixtures for the PDF element classifier.

This suite validates high-level invariants that must hold after classification.

Rules covered:
- Every parts list must contain at least one part image inside it.
- No two parts lists overlap.
- Each part image is inside a parts list.
- No labeled element is deleted.
- Each element has at most one winner candidate.

Real fixture(s) live under this package's fixtures/ directory.
"""

import logging
from collections import defaultdict
from pathlib import Path

import pytest

from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text

log = logging.getLogger(__name__)


def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    fixture_path = Path(__file__).parent.parent / "fixtures" / fixture_file
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        fixture_path.read_text()
    )  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages


# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult


class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            if include_deleted:
                self._cache[cache_key] = [
                    e for e in self.page.blocks if self.result.get_label(e) == label
                ]
            else:
                self._cache[cache_key] = [
                    e
                    for e in self.page.blocks
                    if self.result.get_label(e) == label
                    and not self.result.is_removed(e)
                ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because
        the hierarchy is based on "smallest containing bbox" which means there
        may be intermediate unlabeled elements between a parent and its
        logical children. For validation rules about spatial containment,
        bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy
        result = []
        for elem in self.page.blocks:
            if id(elem) in self.result.removal_reasons:
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                result.append(elem)
        return result

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts = defaultdict(int)
        for e in self.page.blocks:
            label = (
                self.result.get_label(e) if self.result.get_label(e) else "<unknown>"
            )
            label_counts[label] += 1

        logger.info(f"Label counts: {dict(label_counts)}")


# TODO Replace this with just results.get_blocks_by_label()


def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
    return [
        e
        for e in page.blocks
        if result.get_label(e) == "parts_list" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_blocks_by_label()


def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
    return [
        e
        for e in page.blocks
        if result.get_label(e) == "part_image" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_blocks_by_label()


def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
    return [
        e
        for e in page.blocks
        if result.get_label(e) == "part_count" and not result.is_removed(e)
    ]


def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    label_counts = defaultdict(int)
    for e in page.blocks:
        label = result.get_label(e) if result.get_label(e) else "<unknown>"
        label_counts[label] += 1

    # Cast to a plain dict so the log shows the counts rather than the
    # defaultdict repr ("defaultdict(<class 'int'>, ...").
    log.info(f"Label counts: {dict(label_counts)}")


class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image
        inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            classified.print_summary()

            parts_lists = classified.parts_lists()
            part_images = classified.part_images()
            part_counts = classified.part_counts()

            # Debug: show all part_image labeled elements including deleted ones
            all_part_images = classified.elements_by_label(
                "part_image", include_deleted=True
            )
            log.info(
                f"Page {page_idx}: Total on page: {len(parts_lists)} parts_lists, "
                f"{len(part_images)} part_images (non-deleted), "
                f"{len(all_part_images)} total part_images, "
                f"{len(part_counts)} part_counts"
            )
            if len(all_part_images) != len(part_images):
                deleted_count = len(all_part_images) - len(part_images)
                log.warning(
                    f"  WARNING: {deleted_count} part_images are DELETED on this page"
                )
                for img in all_part_images:
                    if result.is_removed(img):
                        # Check if it's inside any parts_list
                        inside_any = any(
                            img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                        )
                        location = (
                            "inside a parts_list"
                            if inside_any
                            else "outside all parts_lists"
                        )
                        log.warning(
                            f"    - Deleted PartImage id:{img.id} "
                            f"bbox:{img.bbox} ({location})"
                        )

            for parts_list in parts_lists:
                part_images_inside = classified.children_of(
                    parts_list, label="part_image"
                )
                part_counts_inside = classified.children_of(
                    parts_list, label="part_count"
                )

                # Also get ALL part_images (including deleted) to check for deletion bugs
                all_part_images_inside = []
                for elem in page.blocks:
                    if result.get_label(
                        elem
                    ) == "part_image" and elem.bbox.fully_inside(parts_list.bbox):
                        all_part_images_inside.append(elem)

                log.info(
                    f"{fixture_file} page {page_idx} PartsList id:{parts_list.id} "
                    f"bbox:{parts_list.bbox} contains:"
                )
                for img in part_images_inside:
                    log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
                for count in part_counts_inside:
                    count_text = count.text if isinstance(count, Text) else ""
                    log.info(
                        f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                    )

                # Log deleted part_images if any
                deleted_images = [
                    img for img in all_part_images_inside if result.is_removed(img)
                ]
                if deleted_images:
                    log.warning(
                        f"  WARNING: {len(deleted_images)} part_images DELETED "
                        f"inside parts_list {parts_list.id}:"
                    )
                    for img in deleted_images:
                        log.warning(
                            f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                        )

                # Debug: log all part images to see why they're not inside
                if len(part_images_inside) == 0:
                    log.info("  DEBUG: All part_images on page:")
                    for img in part_images:
                        log.info(
                            f"  - PartImage id:{img.id} bbox:{img.bbox} "
                            f"inside:{img.bbox.fully_inside(parts_list.bbox)}"
                        )
                # Each parts_list must contain at least one part_image fully inside its bbox
                assert len(part_images_inside) >= 1, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain at least one part image"
                )

                # No part_images inside a parts_list should be deleted
                assert len(deleted_images) == 0, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} has "
                    f"{len(deleted_images)} deleted part_images inside it (should be 0)"
                )

                # Each parts_list must contain the same number of part_counts as
                # part_images inside it
                assert len(part_counts_inside) == len(part_images_inside), (
                    f"PartsList id:{parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain {len(part_images_inside)} PartCounts, "
                    f"found {len(part_counts_inside)}"
                )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()

            # Check all pairs of parts lists for overlap
            for i, parts_list_a in enumerate(parts_lists):
                for parts_list_b in parts_lists[i + 1 :]:
                    assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                        f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                        f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in "
                        f"{fixture_file} page {page_idx} overlap"
                    )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()
            part_images = classified.part_images()

            for part_image in part_images:
                # Check if this part_image is inside at least one parts_list
                inside_any_parts_list = any(
                    part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
                )

                assert inside_any_parts_list, (
                    f"Part image {part_image.id} (bbox:{part_image.bbox}) in "
                    f"{fixture_file} page {page_idx} is not inside any parts_list"
                )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Find all elements that are both labeled and deleted
            labeled_and_deleted = []
            for elem in page.blocks:
                if result.get_label(elem) is not None and result.is_removed(elem):
                    labeled_and_deleted.append(elem)

            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
                )
                for elem in labeled_and_deleted:
                    log.error(
                        f"  - {result.get_label(elem)} id:{elem.id} "
                        f"bbox:{elem.bbox} [DELETED]"
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Track which blocks have won, and for which label
            block_to_winning_label: dict[int, str] = {}

            # Check all candidates across all labels
            all_candidates = result.get_all_candidates()
            for label, candidates in all_candidates.items():
                for candidate in candidates:
                    if not candidate.is_winner:
                        continue

                    # Skip synthetic candidates (no source block)
                    if candidate.source_block is None:
                        continue

                    block_id = candidate.source_block.id

                    # Check if this block already has a winner
                    if block_id in block_to_winning_label:
                        existing_label = block_to_winning_label[block_id]
                        pytest.fail(
                            f"Block {block_id} in {fixture_file} page {page_idx} has multiple "
                            f"winner candidates: '{existing_label}' and '{label}'. "
                            "Each block should have at most one winner."
                        )

                    block_to_winning_label[block_id] = label