bramp / build-along / build 19060277498
04 Nov 2025 06:46AM UTC coverage: 84.229% (-0.02%) from 84.251%
Push build via github by bramp
Commit: Chore: Enabled some more lint checks.

Modified files:
  src/build_a_long/downloader/legocom.py
  src/build_a_long/downloader/metadata.py
  src/build_a_long/pdf_extract/analyze_classifier.py
  src/build_a_long/pdf_extract/classifier/classification_result.py
  src/build_a_long/pdf_extract/classifier/classifier.py
  src/build_a_long/pdf_extract/classifier/classifier_golden_test.py
  src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
  src/build_a_long/pdf_extract/classifier/classifier_test.py
  src/build_a_long/pdf_extract/classifier/hierarchy_builder.py
  src/build_a_long/pdf_extract/classifier/hierarchy_builder_test.py
  src/build_a_long/pdf_extract/classifier/label_classifier.py
  src/build_a_long/pdf_extract/classifier/lego_page_builder.py
  src/build_a_long/pdf_extract/classifier/lego_page_builder_test.py
  src/build_a_long/pdf_extract/classifier/page_number_classifier.py
  src/build_a_long/pdf_extract/classifier/part_count_classifier.py
  src/build_a_long/pdf_extract/classifier/parts_image_classifier.py
  src/build_a_long/pdf_extract/classifier/parts_list_classifier.py
  src/build_a_long/pdf_extract/classifier/step_classifier.py
  src/build_a_long/pdf_extract/classifier/step_number_classifier.py
  src/build_a_long/pdf_extract/classifier/text_extractors.py
  src/build_a_long/pdf_extract/extractor/bbox.py
  src/build_a_long/pdf_extract/extractor/extractor.py
  src/build_a_long/pdf_extract/extractor/hierarchy.py
  src/build_a_long/pdf_extract/extractor/lego_page_elements.py
  src/build_a_long/pdf_extract/extractor/page_data_json_test.py
  src/build_a_long/pdf_extract/extractor/page_elements.py
  src/build_a_long/pdf_extract/extractor/pymupdf_types.py
  src/build_a_long/pdf... (continued)

164 of 175 new or added lines in 28 files covered. (93.71%)

255 existing lines in 12 files now uncovered.

3573 of 4242 relevant lines covered (84.23%)

0.84 hits per line
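As a quick sanity check on the reported totals, the arithmetic works out (an illustrative throwaway snippet, not part of the build):

covered, relevant = 3573, 4242
print(f"{covered / relevant:.2%}")  # -> 84.23%, matching the per-build figure above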

Source File: /src/build_a_long/pdf_extract/classifier/classifier_rules_test.py (22.3% covered)
"""Rule-based tests over real fixtures for the PDF element classifier.

This suite validates high-level invariants that must hold after classification.

Rules covered:
- Every parts list must contain at least one part image inside it.
- No two parts lists overlap.
- Each part image is inside a parts list.

Real fixture(s) live under this package's fixtures/ directory.
"""

import logging
from collections import defaultdict
from pathlib import Path

import pytest

from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.page_elements import Element, Text

log = logging.getLogger(__name__)

# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult


class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        self._cache: dict[str, list[Element]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Element]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            if include_deleted:
                self._cache[cache_key] = [
                    e for e in self.page.elements if self.result.get_label(e) == label
                ]
            else:
                self._cache[cache_key] = [
                    e
                    for e in self.page.elements
                    if self.result.get_label(e) == label
                    and not self.result.is_removed(e)
                ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Element]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Element]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Element]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Element]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Element, label: str | None = None) -> list[Element]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because the hierarchy
        is based on "smallest containing bbox" which means there may be intermediate
        unlabeled elements between a parent and its logical children. For validation
        rules about spatial containment, bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy
        result = []
        for elem in self.page.elements:
            if id(elem) in self.result._removal_reasons:
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                result.append(elem)
        return result

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts = defaultdict(int)
        for e in self.page.elements:
            label = (
                self.result.get_label(e) if self.result.get_label(e) else "<unknown>"
            )
            label_counts[label] += 1

        logger.info(f"Label counts: {dict(label_counts)}")


# TODO Replace this with just results.get_elements_by_label()


def _parts_lists(page: PageData, result: ClassificationResult) -> list[Element]:
    return [
        e
        for e in page.elements
        if result.get_label(e) == "parts_list" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_elements_by_label()


def _part_images(page: PageData, result: ClassificationResult) -> list[Element]:
    return [
        e
        for e in page.elements
        if result.get_label(e) == "part_image" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_elements_by_label()


def _part_counts(page: PageData, result: ClassificationResult) -> list[Element]:
    return [
        e
        for e in page.elements
        if result.get_label(e) == "part_count" and not result.is_removed(e)
    ]


def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    label_counts = defaultdict(int)
    for e in page.elements:
        label = result.get_label(e) if result.get_label(e) else "<unknown>"
        label_counts[label] += 1

    # TODO The following logging shows "defaultdict(<class 'int'>,..." figure
    # out how to avoid that.
    log.info(f"Label counts: {label_counts}")


@pytest.mark.skip(reason="Not working yet.")
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """

        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        classified = ClassifiedPage(page, result)
        classified.print_summary()

        parts_lists = classified.parts_lists()
        part_images = classified.part_images()
        part_counts = classified.part_counts()

        # Debug: show all part_image labeled elements including deleted ones
        all_part_images = classified.elements_by_label(
            "part_image", include_deleted=True
        )
        log.info(
            f"Total on page: {len(parts_lists)} parts_lists, {len(part_images)} part_images (non-deleted), {len(all_part_images)} total part_images, {len(part_counts)} part_counts"
        )
        if len(all_part_images) != len(part_images):
            deleted_count = len(all_part_images) - len(part_images)
            log.warning(
                f"  WARNING: {deleted_count} part_images are DELETED on this page"
            )
            for img in all_part_images:
                if result.is_removed(img):
                    # Check if it's inside any parts_list
                    inside_any = any(
                        img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                    )
                    location = (
                        "inside a parts_list"
                        if inside_any
                        else "outside all parts_lists"
                    )
                    log.warning(
                        f"    - Deleted PartImage id:{img.id} bbox:{img.bbox} ({location})"
                    )

        for parts_list in parts_lists:
            part_images_inside = classified.children_of(parts_list, label="part_image")
            part_counts_inside = classified.children_of(parts_list, label="part_count")

            # Also get ALL part_images (including deleted) to check for deletion bugs
            all_part_images_inside = []
            for elem in page.elements:
                if result.get_label(elem) == "part_image" and elem.bbox.fully_inside(
                    parts_list.bbox
                ):
                    all_part_images_inside.append(elem)

            log.info(
                f"{fixture_file} PartsList id:{parts_list.id} bbox:{parts_list.bbox} contains:"
            )
            for img in part_images_inside:
                log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
            for count in part_counts_inside:
                count_text = count.text if isinstance(count, Text) else ""
                log.info(
                    f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                )

            # Log deleted part_images if any
            deleted_images = [
                img for img in all_part_images_inside if result.is_removed(img)
            ]
            if deleted_images:
                log.warning(
                    f"  WARNING: {len(deleted_images)} part_images DELETED inside parts_list {parts_list.id}:"
                )
                for img in deleted_images:
                    log.warning(
                        f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                    )

            # Debug: log all part images to see why they're not inside
            if len(part_images_inside) == 0:
                log.info("  DEBUG: All part_images on page:")
                for img in part_images:
                    log.info(
                        f"  - PartImage id:{img.id} bbox:{img.bbox} inside:{img.bbox.fully_inside(parts_list.bbox)}"
                    )

            # Each parts_list must contain at least one part_image fully inside its bbox
            assert len(part_images_inside) >= 1, (
                f"Parts list {parts_list.id} in {fixture_file} should contain at least one part image"
            )

            # No part_images inside a parts_list should be deleted
            assert len(deleted_images) == 0, (
                f"Parts list {parts_list.id} in {fixture_file} has {len(deleted_images)} "
                f"deleted part_images inside it (should be 0)"
            )

            # Each parts_list must contain the same number of part_counts as
            # part_images inside it
            assert len(part_counts_inside) == len(part_images_inside), (
                f"PartsList id:{parts_list.id} in {fixture_file} should contain "
                f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
            )

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()

        # Check all pairs of parts lists for overlap
        for i, parts_list_a in enumerate(parts_lists):
            for parts_list_b in parts_lists[i + 1 :]:
                assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                    f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                    f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in {fixture_file} overlap"
                )

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()
        part_images = classified.part_images()

        for part_image in part_images:
            # Check if this part_image is inside at least one parts_list
            inside_any_parts_list = any(
                part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
            )

            assert inside_any_parts_list, (
                f"Part image {part_image.id} (bbox:{part_image.bbox}) in {fixture_file} "
                f"is not inside any parts_list"
            )

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        # Find all elements that are both labeled and deleted
        labeled_and_deleted = []
        for elem in page.elements:
            if result.get_label(elem) is not None and result.is_removed(elem):
                labeled_and_deleted.append(elem)

        if labeled_and_deleted:
            log.error(
                f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
            )
            for elem in labeled_and_deleted:
                log.error(
                    f"  - {result.get_label(elem)} id:{elem.id} bbox:{elem.bbox} [DELETED]"
                )

        assert len(labeled_and_deleted) == 0, (
            f"Found {len(labeled_and_deleted)} labeled elements that are deleted in {fixture_file}. "
            f"Labeled elements should not be deleted."
        )
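
For orientation, the snippet below sketches how the helpers in this file fit together outside of pytest. It only mirrors calls the tests above already make; the fixture name is illustrative, and the snippet is not part of the source file.

from pathlib import Path

from build_a_long.pdf_extract.classifier import classify_elements
from build_a_long.pdf_extract.extractor import PageData

# Hypothetical fixture name; real fixtures live under this package's fixtures/ directory.
fixture_path = Path(__file__).with_name("fixtures") / "example_page.json"
page = PageData.from_json(fixture_path.read_text())

# Run the full classification pipeline, then wrap the result for convenient queries.
result = classify_elements(page)
classified = ClassifiedPage(page, result)
classified.print_summary()

# The core rule under test: every parts_list should spatially contain at least one part_image.
for parts_list in classified.parts_lists():
    images = classified.children_of(parts_list, label="part_image")
    print(f"PartsList {parts_list.id}: {len(images)} part image(s)")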