19155446196

Committed 06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%

Build # 19155446196

Build Type

push

github

Committed by

bramp

Commit Message

refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

Run Details

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

21.02

/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py

"""Rule-based tests over real fixtures for the PDF element classifier.

This suite validates high-level invariants that must hold after classification.

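Rules covered:
- Every parts list must contain at least one part image inside it.
- Every parts list must contain as many part counts as part images.
- No two parts lists overlap.
- Each part image is inside a parts list.
- No labeled block is marked as deleted.
- Each block has at most one winner candidate.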

Real fixture(s) live under this package's fixtures/ directory.
"""

import logging
from collections import defaultdict
from pathlib import Path

import pytest

from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text

log = logging.getLogger(__name__)

# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult


class ClassifiedPage:
    """Pair a PageData with its ClassificationResult for convenient querying.

    Offers label-based lookups over ``page.blocks`` and a spatial
    "children inside this bbox" query. Label lookups are memoised per
    (label, include_deleted) combination for the lifetime of the instance.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Store the page and its classification result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        # Memoised lookups, keyed by "<label>:deleted=<bool>".
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all blocks on the page carrying the given label.

        Args:
            label: The label to filter by
            include_deleted: When False (the default), blocks for which
                ``result.is_removed()`` is true are left out.

        Returns:
            List of matching blocks in page order. The list is the cached
            object itself, so callers should treat it as read-only.
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            self._cache[cache_key] = [
                block
                for block in self.page.blocks
                if self.result.get_label(block) == label
                and (include_deleted or not self.result.is_removed(block))
            ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list blocks."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image blocks."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count blocks."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number blocks."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted blocks spatially contained within a parent block.

        Note: This uses bbox containment, not ElementTree hierarchy, because the
        hierarchy is based on "smallest containing bbox" which means there may be
        intermediate unlabeled blocks between a parent and its logical children.
        For validation rules about spatial containment, bbox checking is more
        appropriate.

        The parent is not excluded explicitly; whether it appears in the result
        depends on whether ``bbox.fully_inside`` treats a bbox as inside itself.

        Args:
            parent: The block whose bbox is searched
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted blocks, in page order, matching the label (if
            specified) whose bbox is fully inside the parent's bbox
        """
        # Removal is checked through the public is_removed() accessor, the same
        # way elements_by_label() does it, instead of reaching into the result's
        # private bookkeeping.
        return [
            block
            for block in self.page.blocks
            if not self.result.is_removed(block)
            and (label is None or self.result.get_label(block) == label)
            and block.bbox.fully_inside(parent.bbox)
        ]

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log how many blocks on the page carry each label.

        Blocks for which ``get_label()`` returns a falsy value are counted
        under "<unknown>".

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts: defaultdict[str, int] = defaultdict(int)
        for block in self.page.blocks:
            label_counts[self.result.get_label(block) or "<unknown>"] += 1

        # Log a plain dict so the message is not cluttered by the defaultdict repr.
        logger.info(f"Label counts: {dict(label_counts)}")


# TODO Replace this with just results.get_blocks_by_label()


def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
    """Collect the page's blocks labeled "parts_list" that are not removed."""
    kept: list[Block] = []
    for block in page.blocks:
        # Check the label first; removal is only consulted for matching blocks.
        if result.get_label(block) != "parts_list":
            continue
        if result.is_removed(block):
            continue
        kept.append(block)
    return kept


# TODO Replace this with just results.get_blocks_by_label()


def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
    """Collect the page's blocks labeled "part_image" that are not removed."""
    kept: list[Block] = []
    for block in page.blocks:
        # Check the label first; removal is only consulted for matching blocks.
        if result.get_label(block) != "part_image":
            continue
        if result.is_removed(block):
            continue
        kept.append(block)
    return kept


# TODO Replace this with just results.get_blocks_by_label()


def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
    """Collect the page's blocks labeled "part_count" that are not removed."""
    kept: list[Block] = []
    for block in page.blocks:
        # Check the label first; removal is only consulted for matching blocks.
        if result.get_label(block) != "part_count":
            continue
        if result.is_removed(block):
            continue
        kept.append(block)
    return kept


def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks on the page carry each label.

    Blocks for which ``result.get_label()`` returns a falsy value are counted
    under "<unknown>". The summary is written to the module logger at INFO.

    Args:
        page: The page whose blocks are counted
        result: The classification result supplying each block's label
    """
    label_counts: defaultdict[str, int] = defaultdict(int)
    for block in page.blocks:
        label_counts[result.get_label(block) or "<unknown>"] += 1

    # Convert to a plain dict so the message reads "{...}" rather than
    # "defaultdict(<class 'int'>, {...})".
    log.info(f"Label counts: {dict(label_counts)}")


# Directory holding the JSON page fixtures, located next to this module.
_FIXTURES_DIR = Path(__file__).with_name("fixtures")

# Fixture file names, sorted so that parametrized test IDs are collected in a
# stable order (glob itself yields entries in arbitrary order).
_FIXTURE_FILES = sorted(path.name for path in _FIXTURES_DIR.glob("*.json"))


def _load_and_classify(fixture_file: str) -> tuple[PageData, ClassificationResult]:
    """Load one JSON fixture and run the full classification pipeline on it.

    Args:
        fixture_file: File name (not path) of a fixture inside fixtures/

    Returns:
        The parsed page together with whatever classify_elements() returns
        for it.
    """
    fixture_path = _FIXTURES_DIR.joinpath(fixture_file)
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    page: PageData = PageData.from_json(fixture_path.read_text(encoding="utf-8"))  # type: ignore[assignment]
    return page, classify_elements(page)


@pytest.mark.skip(reason="Not working yet.")
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification.

    Every test is parametrized over all JSON fixtures in the fixtures/
    directory and classifies the fixture page from scratch.
    """

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image inside its bbox.

        Also checks that no part image inside a parts list is deleted, and that
        each parts list holds as many part counts as part images.
        """
        page, result = _load_and_classify(fixture_file)

        classified = ClassifiedPage(page, result)
        classified.print_summary()

        parts_lists = classified.parts_lists()
        part_images = classified.part_images()
        part_counts = classified.part_counts()

        # Debug: show all part_image labeled blocks including deleted ones
        all_part_images = classified.elements_by_label(
            "part_image", include_deleted=True
        )
        log.info(
            f"Total on page: {len(parts_lists)} parts_lists, {len(part_images)} part_images (non-deleted), {len(all_part_images)} total part_images, {len(part_counts)} part_counts"
        )
        deleted_on_page = [img for img in all_part_images if result.is_removed(img)]
        if deleted_on_page:
            log.warning(
                f"  WARNING: {len(deleted_on_page)} part_images are DELETED on this page"
            )
            for img in deleted_on_page:
                # Report whether the deleted image sat inside any parts_list
                inside_any = any(img.bbox.fully_inside(pl.bbox) for pl in parts_lists)
                location = (
                    "inside a parts_list" if inside_any else "outside all parts_lists"
                )
                log.warning(
                    f"    - Deleted PartImage id:{img.id} bbox:{img.bbox} ({location})"
                )

        for parts_list in parts_lists:
            part_images_inside = classified.children_of(parts_list, label="part_image")
            part_counts_inside = classified.children_of(parts_list, label="part_count")

            # ALL part_images (including deleted) inside this parts_list, to
            # check for deletion bugs. Reuses the page-wide list gathered above
            # rather than rescanning every block for each parts_list.
            all_part_images_inside = [
                img
                for img in all_part_images
                if img.bbox.fully_inside(parts_list.bbox)
            ]

            log.info(
                f"{fixture_file} PartsList id:{parts_list.id} bbox:{parts_list.bbox} contains:"
            )
            for img in part_images_inside:
                log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
            for count in part_counts_inside:
                count_text = count.text if isinstance(count, Text) else ""
                log.info(
                    f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                )

            # Log deleted part_images if any
            deleted_images = [
                img for img in all_part_images_inside if result.is_removed(img)
            ]
            if deleted_images:
                log.warning(
                    f"  WARNING: {len(deleted_images)} part_images DELETED inside parts_list {parts_list.id}:"
                )
                for img in deleted_images:
                    log.warning(
                        f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                    )

            # Debug: log all part images to see why they're not inside
            if not part_images_inside:
                log.info("  DEBUG: All part_images on page:")
                for img in part_images:
                    log.info(
                        f"  - PartImage id:{img.id} bbox:{img.bbox} inside:{img.bbox.fully_inside(parts_list.bbox)}"
                    )

            # Each parts_list must contain at least one part_image fully inside its bbox
            assert len(part_images_inside) >= 1, (
                f"Parts list {parts_list.id} in {fixture_file} should contain at least one part image"
            )

            # No part_images inside a parts_list should be deleted
            assert len(deleted_images) == 0, (
                f"Parts list {parts_list.id} in {fixture_file} has {len(deleted_images)} "
                f"deleted part_images inside it (should be 0)"
            )

            # Each parts_list must contain the same number of part_counts as
            # part_images inside it
            assert len(part_counts_inside) == len(part_images_inside), (
                f"PartsList id:{parts_list.id} in {fixture_file} should contain "
                f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        page, result = _load_and_classify(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()

        # Check every unordered pair of parts lists exactly once
        for i, parts_list_a in enumerate(parts_lists):
            for parts_list_b in parts_lists[i + 1 :]:
                assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                    f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                    f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in {fixture_file} overlap"
                )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        page, result = _load_and_classify(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()
        part_images = classified.part_images()

        for part_image in part_images:
            # Check if this part_image is inside at least one parts_list
            inside_any_parts_list = any(
                part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
            )

            assert inside_any_parts_list, (
                f"Part image {part_image.id} (bbox:{part_image.bbox}) in {fixture_file} "
                f"is not inside any parts_list"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No block with a label should be marked as deleted.

        If a block has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        page, result = _load_and_classify(fixture_file)

        # Find all blocks that are both labeled and deleted
        labeled_and_deleted = [
            block
            for block in page.blocks
            if result.get_label(block) is not None and result.is_removed(block)
        ]

        if labeled_and_deleted:
            log.error(
                f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
            )
            for block in labeled_and_deleted:
                log.error(
                    f"  - {result.get_label(block)} id:{block.id} bbox:{block.bbox} [DELETED]"
                )

        assert len(labeled_and_deleted) == 0, (
            f"Found {len(labeled_and_deleted)} labeled elements that are deleted in {fixture_file}. "
            f"Labeled elements should not be deleted."
        )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each block should have at most one winner candidate across all labels.

        A block can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        _page, result = _load_and_classify(fixture_file)

        # Track which blocks have won, and for which label
        block_to_winning_label: dict[int, str] = {}

        # Check all candidates across all labels
        for label, candidates in result.get_all_candidates().items():
            for candidate in candidates:
                if not candidate.is_winner:
                    continue

                # Skip synthetic candidates (no source block)
                if candidate.source_block is None:
                    continue

                block_id = candidate.source_block.id

                # A second winner for the same block is a failure
                if block_id in block_to_winning_label:
                    existing_label = block_to_winning_label[block_id]
                    pytest.fail(
                        f"Block {block_id} in {fixture_file} has multiple winner candidates: "
                        f"'{existing_label}' and '{label}'. Each block should have at most one winner."
                    )

                block_to_winning_label[block_id] = label

1	"""Rule-based tests over real fixtures for the PDF element classifier.
2
3	This suite validates high-level invariants that must hold after classification.
4
5	Rules covered:
6	- Every parts list must contain at least one part image inside it.
7	- No two parts lists overlap.
8	- Each part image is inside a parts list.
9	- Each element has at most one winner candidate.
10
11	Real fixture(s) live under this package's fixtures/ directory.
12	"""
13
14	import logging	1✔
15	from collections import defaultdict	1✔
16	from pathlib import Path	1✔
17
18	import pytest	1✔
19
20	from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements	1✔
21	from build_a_long.pdf_extract.extractor import PageData	1✔
22	from build_a_long.pdf_extract.extractor.page_blocks import Block, Text	1✔
23
24	log = logging.getLogger(__name__)	1✔
25
26	# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult
27
28
29	class ClassifiedPage:	1✔
30	"""Wrapper around PageData providing convenient access to classified elements.
31
32	This class provides helper methods to query elements by label type and
33	supports hierarchical queries (e.g., finding children inside parent bboxes).
34	Results are cached for efficiency.
35	"""
36
37	def __init__(self, page: PageData, result: ClassificationResult):	1✔
38	"""Initialize with a classified PageData and its result.
39
40	Args:
41	page: PageData that has been run through classify_elements()
42	result: The ClassificationResult for this page
43	"""
44	self.page = page	×
45	self.result = result	×
NEW 46	self._cache: dict[str, list[Block]] = {}	×
47
48	def elements_by_label(	1✔
49	self, label: str, include_deleted: bool = False
50	) -> list[Block]:
51	"""Get all elements with the given label.
52
53	Args:
54	label: The label to filter by
55	include_deleted: Whether to include deleted elements
56
57	Returns:
58	List of elements with matching label
59	"""
60	cache_key = f"{label}:deleted={include_deleted}"	×
61	if cache_key not in self._cache:	×
62	if include_deleted:	×
63	self._cache[cache_key] = [	×
64	e for e in self.page.blocks if self.result.get_label(e) == label
65	]
66	else:
67	self._cache[cache_key] = [	×
68	e
69	for e in self.page.blocks
70	if self.result.get_label(e) == label
71	and not self.result.is_removed(e)
72	]
73	return self._cache[cache_key]	×
74
75	def parts_lists(self) -> list[Block]:	1✔
76	"""Get all non-deleted parts_list elements."""
77	return self.elements_by_label("parts_list")	×
78
79	def part_images(self) -> list[Block]:	1✔
80	"""Get all non-deleted part_image elements."""
81	return self.elements_by_label("part_image")	×
82
83	def part_counts(self) -> list[Block]:	1✔
84	"""Get all non-deleted part_count elements."""
85	return self.elements_by_label("part_count")	×
86
87	def step_numbers(self) -> list[Block]:	1✔
88	"""Get all non-deleted step_number elements."""
89	return self.elements_by_label("step_number")	×
90
91	def children_of(self, parent: Block, label: str \| None = None) -> list[Block]:	1✔
92	"""Return all non-deleted elements spatially contained within a parent element.
93
94	Note: This uses bbox containment, not ElementTree hierarchy, because the hierarchy
95	is based on "smallest containing bbox" which means there may be intermediate
96	unlabeled elements between a parent and its logical children. For validation
97	rules about spatial containment, bbox checking is more appropriate.
98
99	Args:
100	parent: The parent element to search within
101	label: Optional label filter (e.g., "part_image")
102
103	Returns:
104	List of non-deleted Elements matching the label (if specified) that
105	are fully contained within the parent's bbox
106	"""
107	# Use spatial containment, not hierarchy
108	result = []	×
NEW 109	for elem in self.page.blocks:	×
110	if id(elem) in self.result._removal_reasons:	×
111	continue	×
112	if label is not None and self.result.get_label(elem) != label:	×
113	continue	×
114	if elem.bbox.fully_inside(parent.bbox):	×
115	result.append(elem)	×
116	return result	×
117
118	def print_summary(self, logger: logging.Logger \| None = None) -> None:	1✔
119	"""Log a summary of labeled elements.
120
121	Args:
122	logger: Logger to use (defaults to module logger)
123	"""
124	logger = logger or log	×
125	label_counts = defaultdict(int)	×
NEW 126	for e in self.page.blocks:	×
127	label = (	×
128	self.result.get_label(e) if self.result.get_label(e) else "<unknown>"
129	)
130	label_counts[label] += 1	×
131
132	logger.info(f"Label counts: {dict(label_counts)}")	×
133
134
135	# TODO Replace this with just results.get_blocks_by_label()
136
137
138	def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:	1✔
139	return [	×
140	e
141	for e in page.blocks
142	if result.get_label(e) == "parts_list" and not result.is_removed(e)
143	]
144
145
146	# TODO Replace this with just results.get_blocks_by_label()
147
148
149	def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:	1✔
150	return [	×
151	e
152	for e in page.blocks
153	if result.get_label(e) == "part_image" and not result.is_removed(e)
154	]
155
156
157	# TODO Replace this with just results.get_blocks_by_label()
158
159
160	def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:	1✔
161	return [	×
162	e
163	for e in page.blocks
164	if result.get_label(e) == "part_count" and not result.is_removed(e)
165	]
166
167
168	def _print_label_counts(page: PageData, result: ClassificationResult) -> None:	1✔
169	label_counts = defaultdict(int)	×
NEW 170	for e in page.blocks:	×
171	label = result.get_label(e) if result.get_label(e) else "<unknown>"	×
172	label_counts[label] += 1	×
173
174	# TODO The following logging shows "defaultdict(<class 'int'>,..." figure
175	# out how to avoid that.
176	log.info(f"Label counts: {label_counts}")	×
177
178
179	@pytest.mark.skip(reason="Not working yet.")	1✔
180	class TestClassifierRules:	1✔
181	"""End-to-end rules that must hold on real pages after classification."""
182
183	@pytest.mark.parametrize(	1✔
184	"fixture_file",
185	[f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
186	)
187	def test_parts_list_contains_at_least_one_part_image(	1✔
188	self, fixture_file: str
189	) -> None:
190	"""Every labeled parts list should include at least one part image inside its bbox.
191
192	This test runs on all JSON fixtures in the fixtures/ directory.
193	"""
194
195	fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)	×
196	page: PageData = PageData.from_json(fixture_path.read_text()) # type: ignore[assignment]	×
197
198	# Run the full classification pipeline on the page
199	result = classify_elements(page)	×
200
201	classified = ClassifiedPage(page, result)	×
202	classified.print_summary()	×
203
204	parts_lists = classified.parts_lists()	×
205	part_images = classified.part_images()	×
206	part_counts = classified.part_counts()	×
207
208	# Debug: show all part_image labeled elements including deleted ones
209	all_part_images = classified.elements_by_label(	×
210	"part_image", include_deleted=True
211	)
212	log.info(	×
213	f"Total on page: {len(parts_lists)} parts_lists, {len(part_images)} part_images (non-deleted), {len(all_part_images)} total part_images, {len(part_counts)} part_counts"
214	)
215	if len(all_part_images) != len(part_images):	×
216	deleted_count = len(all_part_images) - len(part_images)	×
217	log.warning(	×
218	f" WARNING: {deleted_count} part_images are DELETED on this page"
219	)
220	for img in all_part_images:	×
221	if result.is_removed(img):	×
222	# Check if it's inside any parts_list
223	inside_any = any(	×
224	img.bbox.fully_inside(pl.bbox) for pl in parts_lists
225	)
226	location = (	×
227	"inside a parts_list"
228	if inside_any
229	else "outside all parts_lists"
230	)
231	log.warning(	×
232	f" - Deleted PartImage id:{img.id} bbox:{img.bbox} ({location})"
233	)
234
235	for parts_list in parts_lists:	×
236	part_images_inside = classified.children_of(parts_list, label="part_image")	×
237	part_counts_inside = classified.children_of(parts_list, label="part_count")	×
238
239	# Also get ALL part_images (including deleted) to check for deletion bugs
240	all_part_images_inside = []	×
NEW 241	for elem in page.blocks:	×
242	if result.get_label(elem) == "part_image" and elem.bbox.fully_inside(	×
243	parts_list.bbox
244	):
245	all_part_images_inside.append(elem)	×
246
247	log.info(	×
248	f"{fixture_file} PartsList id:{parts_list.id} bbox:{parts_list.bbox} contains:"
249	)
250	for img in part_images_inside:	×
251	log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")	×
252	for count in part_counts_inside:	×
253	count_text = count.text if isinstance(count, Text) else ""	×
254	log.info(	×
255	f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
256	)
257
258	# Log deleted part_images if any
259	deleted_images = [	×
260	img for img in all_part_images_inside if result.is_removed(img)
261	]
262	if deleted_images:	×
263	log.warning(	×
264	f" WARNING: {len(deleted_images)} part_images DELETED inside parts_list {parts_list.id}:"
265	)
266	for img in deleted_images:	×
267	log.warning(	×
268	f" - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
269	)
270
271	# Debug: log all part images to see why they're not inside
272	if len(part_images_inside) == 0:	×
273	log.info(" DEBUG: All part_images on page:")	×
274	for img in part_images:	×
275	log.info(	×
276	f" - PartImage id:{img.id} bbox:{img.bbox} inside:{img.bbox.fully_inside(parts_list.bbox)}"
277	)
278
279	# Each parts_list must contain at least one part_image fully inside its bbox
280	assert len(part_images_inside) >= 1, (	×
281	f"Parts list {parts_list.id} in {fixture_file} should contain at least one part image"
282	)
283
284	# No part_images inside a parts_list should be deleted
285	assert len(deleted_images) == 0, (	×
286	f"Parts list {parts_list.id} in {fixture_file} has {len(deleted_images)} "
287	f"deleted part_images inside it (should be 0)"
288	)
289
290	# Each parts_list must contain the same number of part_counts as
291	# part_images inside it
292	assert len(part_counts_inside) == len(part_images_inside), (	×
293	f"PartsList id:{parts_list.id} in {fixture_file} should contain "
294	f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
295	)
296
297	@pytest.mark.parametrize(	1✔
298	"fixture_file",
299	[f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
300	)
301	def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:	1✔
302	"""No two parts lists should overlap.
303
304	Parts lists represent distinct areas of the page and should not
305	have overlapping bounding boxes.
306	"""
307	fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)	×
308	page: PageData = PageData.from_json(fixture_path.read_text()) # type: ignore[assignment]	×
309
310	# Run the full classification pipeline on the page
311	result = classify_elements(page)	×
312
313	classified = ClassifiedPage(page, result)	×
314	parts_lists = classified.parts_lists()	×
315
316	# Check all pairs of parts lists for overlap
317	for i, parts_list_a in enumerate(parts_lists):	×
318	for parts_list_b in parts_lists[i + 1 :]:	×
319	assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (	×
320	f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
321	f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in {fixture_file} overlap"
322	)
323
324	@pytest.mark.parametrize(	1✔
325	"fixture_file",
326	[f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
327	)
328	def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:	1✔
329	"""Each part image must be inside at least one parts list.
330
331	Every part_image should be contained within a parts_list's bounding box.
332	"""
333	fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)	×
334	page: PageData = PageData.from_json(fixture_path.read_text()) # type: ignore[assignment]	×
335
336	# Run the full classification pipeline on the page
337	result = classify_elements(page)	×
338
339	classified = ClassifiedPage(page, result)	×
340	parts_lists = classified.parts_lists()	×
341	part_images = classified.part_images()	×
342
343	for part_image in part_images:	×
344	# Check if this part_image is inside at least one parts_list
345	inside_any_parts_list = any(	×
346	part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
347	)
348
349	assert inside_any_parts_list, (	×
350	f"Part image {part_image.id} (bbox:{part_image.bbox}) in {fixture_file} "
351	f"is not inside any parts_list"
352	)
353
354	@pytest.mark.parametrize(	1✔
355	"fixture_file",
356	[f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
357	)
358	def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:	1✔
359	"""No element with a label should be marked as deleted.
360
361	If an element has been classified with a label, it should not be deleted.
362	This ensures that the classification and deletion logic don't conflict.
363	"""
364	fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)	×
365	page: PageData = PageData.from_json(fixture_path.read_text()) # type: ignore[assignment]	×
366
367	# Run the full classification pipeline on the page
368	result = classify_elements(page)	×
369
370	# Find all elements that are both labeled and deleted
371	labeled_and_deleted = []	×
NEW 372	for elem in page.blocks:	×
373	if result.get_label(elem) is not None and result.is_removed(elem):	×
374	labeled_and_deleted.append(elem)	×
375
376	if labeled_and_deleted:	×
377	log.error(	×
378	f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
379	)
380	for elem in labeled_and_deleted:	×
381	log.error(	×
382	f" - {result.get_label(elem)} id:{elem.id} bbox:{elem.bbox} [DELETED]"
383	)
384
385	assert len(labeled_and_deleted) == 0, (	×
386	f"Found {len(labeled_and_deleted)} labeled elements that are deleted in {fixture_file}. "
387	f"Labeled elements should not be deleted."
388	)
389
390	@pytest.mark.parametrize(	1✔
391	"fixture_file",
392	[f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
393	)
394	def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:	1✔
395	"""Each element should have at most one winner candidate across all labels.
396
397	An element can have multiple candidates across different labels, but only
398	one of them should be marked as a winner. This ensures classification
399	decisions are unambiguous.
400	"""
401	fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)	×
402	page: PageData = PageData.from_json(fixture_path.read_text()) # type: ignore[assignment]	×
403
404	# Run the full classification pipeline on the page
405	result = classify_elements(page)	×
406
407	# Track which blocks have won, and for which label
NEW 408	block_to_winning_label: dict[int, str] = {}	×
409
410	# Check all candidates across all labels
411	all_candidates = result.get_all_candidates()	×
412	for label, candidates in all_candidates.items():	×
413	for candidate in candidates:	×
414	if not candidate.is_winner:	×
415	continue	×
416
417	# Skip synthetic candidates (no source block)
NEW 418	if candidate.source_block is None:	×
UNCOV 419	continue	×
420
NEW 421	block_id = candidate.source_block.id	×
422
423	# Check if this block already has a winner
NEW 424	if block_id in block_to_winning_label:	×
NEW 425	existing_label = block_to_winning_label[block_id]	×
UNCOV 426	pytest.fail(	×
427	f"Block {block_id} in {fixture_file} has multiple winner candidates: "
428	f"'{existing_label}' and '{label}'. Each block should have at most one winner."
429	)
430
NEW 431	block_to_winning_label[block_id] = label	×

bramp / build-along / 19155446196

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous