• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19217904329

10 Nov 2025 01:35AM UTC coverage: 87.121% (+0.7%) from 86.426%
19217904329

push

github

bramp
Bumped some dependencies.

4600 of 5280 relevant lines covered (87.12%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

21.02
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- Every parts list must contain at least one part image inside it.
7
- No two parts lists overlap.
8
- Each part image is inside a parts list.
9
- Each element has at most one winner candidate.
10

11
Real fixture(s) live under this package's fixtures/ directory.
12
"""
13

14
import logging
1✔
15
from collections import defaultdict
1✔
16
from pathlib import Path
1✔
17

18
import pytest
1✔
19

20
from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
1✔
21
from build_a_long.pdf_extract.extractor import PageData
1✔
22
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text
1✔
23

24
log = logging.getLogger(__name__)
1✔
25

26
# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult
27

28

29
class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        # Maps "{label}:deleted={bool}" -> matching blocks; populated lazily
        # by elements_by_label().
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label (cached per label/flag pair)
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            # A single comprehension covers both cases: when include_deleted
            # is True the removal check short-circuits away.
            self._cache[cache_key] = [
                e
                for e in self.page.blocks
                if self.result.get_label(e) == label
                and (include_deleted or not self.result.is_removed(e))
            ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because
        the hierarchy is based on "smallest containing bbox" which means there
        may be intermediate unlabeled elements between a parent and its
        logical children. For validation rules about spatial containment,
        bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy.
        # NOTE(review): the removal check previously read
        # `id(elem) in self.result.removal_reasons`; switched to
        # result.is_removed() for consistency with the rest of this module —
        # assumed equivalent, confirm against ClassificationResult.
        contained: list[Block] = []
        for elem in self.page.blocks:
            if self.result.is_removed(elem):
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                contained.append(elem)
        return contained

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts: defaultdict[str, int] = defaultdict(int)
        for e in self.page.blocks:
            # Single get_label() call per element; falsy labels count
            # as "<unknown>".
            label_counts[self.result.get_label(e) or "<unknown>"] += 1

        logger.info(f"Label counts: {dict(label_counts)}")
134

135

136
# TODO Replace this with just results.get_blocks_by_label()
137

138

139
def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
1✔
140
    return [
×
141
        e
142
        for e in page.blocks
143
        if result.get_label(e) == "parts_list" and not result.is_removed(e)
144
    ]
145

146

147
# TODO Replace this with just results.get_blocks_by_label()
148

149

150
def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
1✔
151
    return [
×
152
        e
153
        for e in page.blocks
154
        if result.get_label(e) == "part_image" and not result.is_removed(e)
155
    ]
156

157

158
# TODO Replace this with just results.get_blocks_by_label()
159

160

161
def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
1✔
162
    return [
×
163
        e
164
        for e in page.blocks
165
        if result.get_label(e) == "part_count" and not result.is_removed(e)
166
    ]
167

168

169
def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks carry each label ("<unknown>" for unlabeled).

    Args:
        page: The page whose blocks are tallied.
        result: Classification result used to look up each block's label.
    """
    label_counts: defaultdict[str, int] = defaultdict(int)
    for e in page.blocks:
        # Single get_label() call per element; falsy labels count as unknown.
        label_counts[result.get_label(e) or "<unknown>"] += 1

    # Convert to a plain dict so the log reads "{...}" instead of
    # "defaultdict(<class 'int'>, {...})" (resolves the old TODO).
    log.info(f"Label counts: {dict(label_counts)}")
178

179

180
@pytest.mark.skip(reason="Not working yet.")
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    # Computed once at class-creation time and shared by every parametrized
    # test below (names defined earlier in a class body are visible to the
    # method decorators). Previously this glob was duplicated five times.
    _FIXTURE_NAMES = [
        f.name for f in (Path(__file__).parent.parent / "fixtures").glob("*.json")
    ]

    @staticmethod
    def _classify_fixture(fixture_file: str) -> tuple[PageData, ClassificationResult]:
        """Load the named JSON fixture and run the full classification pipeline.

        Args:
            fixture_file: Name of a file in this package's fixtures/ directory.

        Returns:
            The parsed PageData and the ClassificationResult produced by
            classify_elements().
        """
        fixture_path = Path(__file__).parent.parent / "fixtures" / fixture_file
        page: PageData = PageData.model_validate_json(fixture_path.read_text())
        return page, classify_elements(page)

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image
        inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        classified.print_summary()

        parts_lists = classified.parts_lists()
        part_images = classified.part_images()
        part_counts = classified.part_counts()

        # Debug: show all part_image labeled elements including deleted ones
        all_part_images = classified.elements_by_label(
            "part_image", include_deleted=True
        )
        log.info(
            f"Total on page: {len(parts_lists)} parts_lists, "
            f"{len(part_images)} part_images (non-deleted), "
            f"{len(all_part_images)} total part_images, "
            f"{len(part_counts)} part_counts"
        )
        if len(all_part_images) != len(part_images):
            deleted_count = len(all_part_images) - len(part_images)
            log.warning(
                f"  WARNING: {deleted_count} part_images are DELETED on this page"
            )
            for img in all_part_images:
                if result.is_removed(img):
                    # Check if it's inside any parts_list
                    inside_any = any(
                        img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                    )
                    location = (
                        "inside a parts_list"
                        if inside_any
                        else "outside all parts_lists"
                    )
                    log.warning(
                        f"    - Deleted PartImage id:{img.id} "
                        f"bbox:{img.bbox} ({location})"
                    )

        for parts_list in parts_lists:
            part_images_inside = classified.children_of(parts_list, label="part_image")
            part_counts_inside = classified.children_of(parts_list, label="part_count")

            # Also get ALL part_images (including deleted) to check for deletion bugs
            all_part_images_inside = [
                elem
                for elem in page.blocks
                if result.get_label(elem) == "part_image"
                and elem.bbox.fully_inside(parts_list.bbox)
            ]

            log.info(
                f"{fixture_file} PartsList id:{parts_list.id} "
                f"bbox:{parts_list.bbox} contains:"
            )
            for img in part_images_inside:
                log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
            for count in part_counts_inside:
                count_text = count.text if isinstance(count, Text) else ""
                log.info(
                    f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                )

            # Log deleted part_images if any
            deleted_images = [
                img for img in all_part_images_inside if result.is_removed(img)
            ]
            if deleted_images:
                log.warning(
                    f"  WARNING: {len(deleted_images)} part_images DELETED "
                    f"inside parts_list {parts_list.id}:"
                )
                for img in deleted_images:
                    log.warning(
                        f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                    )

            # Debug: log all part images to see why they're not inside
            if len(part_images_inside) == 0:
                log.info("  DEBUG: All part_images on page:")
                for img in part_images:
                    log.info(
                        f"  - PartImage id:{img.id} bbox:{img.bbox} "
                        f"inside:{img.bbox.fully_inside(parts_list.bbox)}"
                    )
            # Each parts_list must contain at least one part_image fully inside its bbox
            assert len(part_images_inside) >= 1, (
                f"Parts list {parts_list.id} in {fixture_file} should contain "
                f"at least one part image"
            )

            # No part_images inside a parts_list should be deleted
            assert len(deleted_images) == 0, (
                f"Parts list {parts_list.id} in {fixture_file} has "
                f"{len(deleted_images)} deleted part_images inside it (should be 0)"
            )

            # Each parts_list must contain the same number of part_counts as
            # part_images inside it
            assert len(part_counts_inside) == len(part_images_inside), (
                f"PartsList id:{parts_list.id} in {fixture_file} should contain "
                f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()

        # Check all pairs of parts lists for overlap
        for i, parts_list_a in enumerate(parts_lists):
            for parts_list_b in parts_lists[i + 1 :]:
                assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                    f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                    f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in "
                    f"{fixture_file} overlap"
                )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()
        part_images = classified.part_images()

        for part_image in part_images:
            # Check if this part_image is inside at least one parts_list
            inside_any_parts_list = any(
                part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
            )

            assert inside_any_parts_list, (
                f"Part image {part_image.id} (bbox:{part_image.bbox}) in "
                f"{fixture_file} is not inside any parts_list"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        page, result = self._classify_fixture(fixture_file)

        # Find all elements that are both labeled and deleted
        labeled_and_deleted = [
            elem
            for elem in page.blocks
            if result.get_label(elem) is not None and result.is_removed(elem)
        ]

        if labeled_and_deleted:
            log.error(
                f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
            )
            for elem in labeled_and_deleted:
                log.error(
                    f"  - {result.get_label(elem)} id:{elem.id} "
                    f"bbox:{elem.bbox} [DELETED]"
                )

        assert len(labeled_and_deleted) == 0, (
            f"Found {len(labeled_and_deleted)} labeled elements that are "
            f"deleted in {fixture_file}. Labeled elements should not be deleted."
        )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        page, result = self._classify_fixture(fixture_file)

        # Track which blocks have won, and for which label
        block_to_winning_label: dict[int, str] = {}

        # Check all candidates across all labels
        all_candidates = result.get_all_candidates()
        for label, candidates in all_candidates.items():
            for candidate in candidates:
                if not candidate.is_winner:
                    continue

                # Skip synthetic candidates (no source block)
                if candidate.source_block is None:
                    continue

                block_id = candidate.source_block.id

                # Check if this block already has a winner
                if block_id in block_to_winning_label:
                    existing_label = block_to_winning_label[block_id]
                    pytest.fail(
                        f"Block {block_id} in {fixture_file} has multiple "
                        f"winner candidates: '{existing_label}' and '{label}'. "
                        "Each block should have at most one winner."
                    )

                block_to_winning_label[block_id] = label
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc