• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19257583787

11 Nov 2025 06:52AM UTC coverage: 91.217% (+0.5%) from 90.748%
19257583787

push

github

bramp
feat(pdf_extract): Update lego_page_layout tool

- Add support for ProgressBar and PartNumber elements.
- Remove NewBag and BagNumber from the example.
- Adjust ProgressBar to be left-aligned with steps and have a margin.
- Regenerate the layout diagram.

4923 of 5397 relevant lines covered (91.22%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.33
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- Every parts list must contain at least one part image inside it.
7
- No two parts lists overlap.
8
- Each part image is inside a parts list.
9
- Each element has at most one winner candidate.
10

11
Real fixture(s) live under this package's fixtures/ directory.
12
"""
13

14
import logging
1✔
15
from collections import defaultdict
1✔
16
from pathlib import Path
1✔
17

18
import pytest
1✔
19

20
from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
1✔
21
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
22
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
23
    LegoPageElement,
24
    Page,
25
    Part,
26
    PartsList,
27
    Step,
28
)
29
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text
1✔
30
from build_a_long.pdf_extract.fixtures import RAW_FIXTURE_FILES
1✔
31

32
log = logging.getLogger(__name__)
1✔
33

34

35
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    fixture_dir = Path(__file__).parent.parent / "fixtures"
    raw_json = (fixture_dir / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    pages = extraction.pages
    if not pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return pages
56

57

58
# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult
59

60

61
class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        # Maps "label:deleted=<bool>" cache keys to matching blocks,
        # filled lazily by elements_by_label().
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            # Filter by label once, then optionally drop removed elements.
            # (Replaces two near-duplicate comprehensions.)
            matches = [
                e for e in self.page.blocks if self.result.get_label(e) == label
            ]
            if not include_deleted:
                matches = [e for e in matches if not self.result.is_removed(e)]
            self._cache[cache_key] = matches
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because
        the hierarchy is based on "smallest containing bbox" which means there
        may be intermediate unlabeled elements between a parent and its
        logical children. For validation rules about spatial containment,
        bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy
        result = []
        for elem in self.page.blocks:
            # Use the is_removed() API for consistency with elements_by_label()
            # instead of probing result.removal_reasons directly.
            # NOTE(review): assumes is_removed(elem) is equivalent to
            # `id(elem) in result.removal_reasons` — confirm in ClassificationResult.
            if self.result.is_removed(elem):
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                result.append(elem)
        return result

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts: defaultdict[str, int] = defaultdict(int)
        for e in self.page.blocks:
            # Call get_label() once per element; a falsy label counts
            # as "<unknown>" (same semantics as the original double call).
            label_counts[self.result.get_label(e) or "<unknown>"] += 1

        logger.info(f"Label counts: {dict(label_counts)}")
166

167

168
# TODO Replace this with just results.get_blocks_by_label()
169

170

171
def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return all non-deleted blocks labeled "parts_list"."""
    matching = []
    for block in page.blocks:
        if result.get_label(block) == "parts_list" and not result.is_removed(block):
            matching.append(block)
    return matching
177

178

179
# TODO Replace this with just results.get_blocks_by_label()
180

181

182
def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return all non-deleted blocks labeled "part_image"."""
    matching = []
    for block in page.blocks:
        if result.get_label(block) == "part_image" and not result.is_removed(block):
            matching.append(block)
    return matching
188

189

190
# TODO Replace this with just results.get_blocks_by_label()
191

192

193
def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return all non-deleted blocks labeled "part_count"."""
    matching = []
    for block in page.blocks:
        if result.get_label(block) == "part_count" and not result.is_removed(block):
            matching.append(block)
    return matching
199

200

201
def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks carry each label (unlabeled blocks as "<unknown>")."""
    label_counts: defaultdict[str, int] = defaultdict(int)
    for e in page.blocks:
        label = result.get_label(e) or "<unknown>"
        label_counts[label] += 1

    # Convert to a plain dict so the log line reads "{'x': 1}" rather than
    # "defaultdict(<class 'int'>, {'x': 1})" (resolves the old TODO).
    log.info(f"Label counts: {dict(label_counts)}")
210

211

212
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image
        inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            classified.print_summary()

            parts_lists = classified.parts_lists()
            part_images = classified.part_images()
            part_counts = classified.part_counts()

            # Debug: show all part_image labeled elements including deleted ones
            all_part_images = classified.elements_by_label(
                "part_image", include_deleted=True
            )
            log.info(
                f"Page {page_idx}: Total on page: {len(parts_lists)} parts_lists, "
                f"{len(part_images)} part_images (non-deleted), "
                f"{len(all_part_images)} total part_images, "
                f"{len(part_counts)} part_counts"
            )
            if len(all_part_images) != len(part_images):
                deleted_count = len(all_part_images) - len(part_images)
                log.warning(
                    f"  WARNING: {deleted_count} part_images are DELETED on this page"
                )
                for img in all_part_images:
                    if result.is_removed(img):
                        # Check if it's inside any parts_list
                        inside_any = any(
                            img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                        )
                        location = (
                            "inside a parts_list"
                            if inside_any
                            else "outside all parts_lists"
                        )
                        log.warning(
                            f"    - Deleted PartImage id:{img.id} "
                            f"bbox:{img.bbox} ({location})"
                        )

            for parts_list in parts_lists:
                part_images_inside = classified.children_of(
                    parts_list, label="part_image"
                )
                part_counts_inside = classified.children_of(
                    parts_list, label="part_count"
                )

                # ALL part_images (including deleted) inside this parts_list,
                # used below to detect deletion bugs. A comprehension replaces
                # the original manual append loop.
                all_part_images_inside = [
                    elem
                    for elem in page.blocks
                    if result.get_label(elem) == "part_image"
                    and elem.bbox.fully_inside(parts_list.bbox)
                ]

                log.info(
                    f"{fixture_file} page {page_idx} PartsList id:{parts_list.id} "
                    f"bbox:{parts_list.bbox} contains:"
                )
                for img in part_images_inside:
                    log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
                for count in part_counts_inside:
                    count_text = count.text if isinstance(count, Text) else ""
                    log.info(
                        f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                    )

                # Log deleted part_images if any
                deleted_images = [
                    img for img in all_part_images_inside if result.is_removed(img)
                ]
                if deleted_images:
                    log.warning(
                        f"  WARNING: {len(deleted_images)} part_images DELETED "
                        f"inside parts_list {parts_list.id}:"
                    )
                    for img in deleted_images:
                        log.warning(
                            f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                        )

                # Debug: log all part images to see why they're not inside
                if len(part_images_inside) == 0:
                    log.info("  DEBUG: All part_images on page:")
                    for img in part_images:
                        log.info(
                            f"  - PartImage id:{img.id} bbox:{img.bbox} "
                            f"inside:{img.bbox.fully_inside(parts_list.bbox)}"
                        )
                # Each parts_list must contain at least one part_image fully inside its bbox
                assert len(part_images_inside) >= 1, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain at least one part image"
                )

                # No part_images inside a parts_list should be deleted
                assert len(deleted_images) == 0, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} has "
                    f"{len(deleted_images)} deleted part_images inside it (should be 0)"
                )

                # Each parts_list must contain the same number of part_counts as
                # part_images inside it
                assert len(part_counts_inside) == len(part_images_inside), (
                    f"PartsList id:{parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain {len(part_images_inside)} PartCounts, "
                    f"found {len(part_counts_inside)}"
                )
338

339
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Classify the page, then inspect the surviving parts lists.
            result = classify_elements(page)
            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()

            # Compare every unordered pair of parts lists exactly once.
            for idx, parts_list_a in enumerate(parts_lists):
                remaining = parts_lists[idx + 1 :]
                for parts_list_b in remaining:
                    overlapping = parts_list_a.bbox.overlaps(parts_list_b.bbox)
                    assert not overlapping, (
                        f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                        f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in "
                        f"{fixture_file} page {page_idx} overlap"
                    )
363

364
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)
            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()

            for part_image in classified.part_images():
                # The image passes if any parts_list fully contains its bbox.
                containers = [
                    pl for pl in parts_lists if part_image.bbox.fully_inside(pl.bbox)
                ]

                assert containers, (
                    f"Part image {part_image.id} (bbox:{part_image.bbox}) in "
                    f"{fixture_file} page {page_idx} is not inside any parts_list"
                )
390

391
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Collect every block that is simultaneously labeled and removed.
            labeled_and_deleted = [
                elem
                for elem in page.blocks
                if result.get_label(elem) is not None and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
                )
                for elem in labeled_and_deleted:
                    log.error(
                        f"  - {result.get_label(elem)} id:{elem.id} "
                        f"bbox:{elem.bbox} [DELETED]"
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )
425

426
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Maps a block id to the label that already claimed it as a winner.
            block_to_winning_label: dict[int, str] = {}

            # Walk every candidate under every label.
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    # Only winning candidates matter; synthetic candidates
                    # (no source block) are also excluded.
                    if not candidate.is_winner or candidate.source_block is None:
                        continue

                    block_id = candidate.source_block.id
                    if block_id in block_to_winning_label:
                        existing_label = block_to_winning_label[block_id]
                        pytest.fail(
                            f"Block {block_id} in {fixture_file} page {page_idx} has multiple "
                            f"winner candidates: '{existing_label}' and '{label}'. "
                            "Each block should have at most one winner."
                        )

                    block_to_winning_label[block_id] = label
466

467
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_winners_discoverable_from_page(self, fixture_file: str) -> None:
        """All winning candidates should be discoverable from the root Page element.

        This test ensures that every winning candidate (constructed LegoPageElement)
        can be found by traversing the Page hierarchy. This validates that:
        1. The page builder properly includes all winners in the hierarchy
        2. No winning candidates are orphaned or lost during page construction
        3. The hierarchical structure is complete

        Note: Some pages are skipped due to known issues in the page builder:
        - Catalog/inventory pages (not yet supported)
        - Pages with orphaned step_numbers or part_counts (bugs to fix)

        TODO: Fix page builder bugs and remove pages from KNOWN_ISSUES skip list.
        """
        # Skip known pages with bugs in the page builder
        # TODO: Fix these bugs and remove from skip list
        KNOWN_ISSUES = frozenset(
            [
                # Catalog/inventory pages with no steps - page builder doesn't support yet
                "6509377_page_180_raw.json",  # 178 orphaned parts/part_counts
                # Regular instruction pages with orphaned winners - bugs to fix
                "6509377_page_010_raw.json",  # 1 orphaned step_number
                "6509377_page_013_raw.json",  # 2 orphaned part_counts
                "6509377_page_014_raw.json",  # 1 orphaned part_count
                "6509377_page_015_raw.json",  # 1 orphaned part_count
            ]
        )

        if fixture_file in KNOWN_ISSUES:
            pytest.skip(f"Skipping {fixture_file}: known page builder issue")

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run classification
            result = classify_elements(page_data)

            # The classifier attaches the constructed Page hierarchy to the result.
            page = result.page
            if page is None:
                pytest.fail(f"Page element is None for {fixture_file} page {page_idx}")

            # Depth-first walk of the Page hierarchy, recording the identity
            # of every constructed element that can be reached from the root.
            discovered_elements: set[int] = set()
            stack: list[LegoPageElement] = [page]

            while stack:
                element = stack.pop()
                discovered_elements.add(id(element))

                if isinstance(element, Page):
                    # Page attributes
                    if element.page_number:
                        stack.append(element.page_number)
                    if element.progress_bar:
                        stack.append(element.progress_bar)
                    stack.extend(element.steps)
                elif isinstance(element, Step):
                    # Step attributes (all required fields)
                    stack.extend(
                        (element.step_number, element.parts_list, element.diagram)
                    )
                elif isinstance(element, PartsList):
                    stack.extend(element.parts)
                elif isinstance(element, Part):
                    # Part.count is PartCount (LegoPageElement);
                    # Part.diagram is Drawing | None (not LegoPageElement), so skip it.
                    stack.append(element.count)

            # Get all winning candidates (all types, not just structural).
            winning_candidates = [
                (label, candidate)
                for label, candidates in result.get_all_candidates().items()
                for candidate in candidates
                if candidate.is_winner and candidate.constructed is not None
            ]

            # Any winner whose constructed element was never reached is orphaned.
            orphaned = [
                (label, candidate)
                for label, candidate in winning_candidates
                if id(candidate.constructed) not in discovered_elements
            ]

            if orphaned:
                log.error(
                    f"Found {len(orphaned)} winning candidates not discoverable "
                    f"from Page in {fixture_file} page {page_idx}:"
                )
                for label, candidate in orphaned:
                    log.error(
                        f"  - {label}: {candidate.constructed} "
                        f"(id={id(candidate.constructed)})"
                    )

            assert len(orphaned) == 0, (
                f"Found {len(orphaned)} winning candidates that are not "
                f"discoverable from the root Page element in {fixture_file} "
                f"page {page_idx}. All winners should be part of the "
                f"hierarchical structure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc