• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19257583787

11 Nov 2025 06:52AM UTC coverage: 91.217% (+0.5%) from 90.748%
19257583787

push

github

bramp
feat(pdf_extract): Update lego_page_layout tool

- Add support for ProgressBar and PartNumber elements.
- Remove NewBag and BagNumber from the example.
- Adjust ProgressBar to be left-aligned with steps and have a margin.
- Regenerate the layout diagram.

4923 of 5397 relevant lines covered (91.22%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.33
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- Every parts list must contain at least one part image inside it.
7
- No two parts lists overlap.
8
- Each part image is inside a parts list.
9
- Each element has at most one winner candidate.
10

11
Real fixture(s) live under this package's fixtures/ directory.
12
"""
13

14
import logging
1✔
15
from collections import defaultdict
1✔
16
from pathlib import Path
1✔
17

18
import pytest
1✔
19

20
from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
1✔
21
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
1✔
22
from build_a_long.pdf_extract.extractor.lego_page_elements import (
1✔
23
    LegoPageElement,
24
    Page,
25
    Part,
26
    PartsList,
27
    Step,
28
)
29
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text
1✔
30
from build_a_long.pdf_extract.fixtures import RAW_FIXTURE_FILES
1✔
31

32
log = logging.getLogger(__name__)
1✔
33

34

35
def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    fixture_dir = Path(__file__).parent.parent / "fixtures"
    raw_json = (fixture_dir / fixture_file).read_text()
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        raw_json
    )  # type: ignore[assignment]

    pages = extraction.pages
    if not pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return pages
56

57

58
# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult
59

60

61
class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        # Maps "label:deleted=<bool>" cache keys to matching blocks,
        # filled lazily by elements_by_label().
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            # Filter by label once, then optionally drop removed elements.
            # (Replaces two near-duplicate comprehensions.)
            matches = [
                e for e in self.page.blocks if self.result.get_label(e) == label
            ]
            if not include_deleted:
                matches = [e for e in matches if not self.result.is_removed(e)]
            self._cache[cache_key] = matches
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because
        the hierarchy is based on "smallest containing bbox" which means there
        may be intermediate unlabeled elements between a parent and its
        logical children. For validation rules about spatial containment,
        bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy
        result = []
        for elem in self.page.blocks:
            # Use the is_removed() API for consistency with elements_by_label()
            # instead of probing result.removal_reasons directly.
            # NOTE(review): assumes is_removed(elem) is equivalent to
            # `id(elem) in result.removal_reasons` — confirm in ClassificationResult.
            if self.result.is_removed(elem):
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                result.append(elem)
        return result

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts: defaultdict[str, int] = defaultdict(int)
        for e in self.page.blocks:
            # Call get_label() once per element; a falsy label counts
            # as "<unknown>" (same semantics as the original double call).
            label_counts[self.result.get_label(e) or "<unknown>"] += 1

        logger.info(f"Label counts: {dict(label_counts)}")
166

167

168
# TODO Replace this with just results.get_blocks_by_label()
169

170

171
def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return all non-deleted blocks labeled "parts_list"."""
    matching = []
    for block in page.blocks:
        if result.get_label(block) == "parts_list" and not result.is_removed(block):
            matching.append(block)
    return matching
177

178

179
# TODO Replace this with just results.get_blocks_by_label()
180

181

182
def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return all non-deleted blocks labeled "part_image"."""
    matching = []
    for block in page.blocks:
        if result.get_label(block) == "part_image" and not result.is_removed(block):
            matching.append(block)
    return matching
188

189

190
# TODO Replace this with just results.get_blocks_by_label()
191

192

193
def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return all non-deleted blocks labeled "part_count"."""
    matching = []
    for block in page.blocks:
        if result.get_label(block) == "part_count" and not result.is_removed(block):
            matching.append(block)
    return matching
199

200

201
def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks carry each label (unlabeled blocks as "<unknown>")."""
    label_counts: defaultdict[str, int] = defaultdict(int)
    for e in page.blocks:
        label = result.get_label(e) or "<unknown>"
        label_counts[label] += 1

    # Convert to a plain dict so the log line reads "{'x': 1}" rather than
    # "defaultdict(<class 'int'>, {'x': 1})" (resolves the old TODO).
    log.info(f"Label counts: {dict(label_counts)}")
210

211

212
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image
        inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            classified.print_summary()

            parts_lists = classified.parts_lists()
            part_images = classified.part_images()
            part_counts = classified.part_counts()

            # Debug: show all part_image labeled elements including deleted ones
            all_part_images = classified.elements_by_label(
                "part_image", include_deleted=True
            )
            log.info(
                f"Page {page_idx}: Total on page: {len(parts_lists)} parts_lists, "
                f"{len(part_images)} part_images (non-deleted), "
                f"{len(all_part_images)} total part_images, "
                f"{len(part_counts)} part_counts"
            )
            if len(all_part_images) != len(part_images):
                deleted_count = len(all_part_images) - len(part_images)
                log.warning(
                    f"  WARNING: {deleted_count} part_images are DELETED on this page"
                )
                for img in all_part_images:
                    if result.is_removed(img):
                        # Check if it's inside any parts_list
                        inside_any = any(
                            img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                        )
                        location = (
                            "inside a parts_list"
                            if inside_any
                            else "outside all parts_lists"
                        )
                        log.warning(
                            f"    - Deleted PartImage id:{img.id} "
                            f"bbox:{img.bbox} ({location})"
                        )

            for parts_list in parts_lists:
                part_images_inside = classified.children_of(
                    parts_list, label="part_image"
                )
                part_counts_inside = classified.children_of(
                    parts_list, label="part_count"
                )

                # ALL part_images (including deleted) inside this parts_list,
                # used below to detect deletion bugs. A comprehension replaces
                # the original manual append loop.
                all_part_images_inside = [
                    elem
                    for elem in page.blocks
                    if result.get_label(elem) == "part_image"
                    and elem.bbox.fully_inside(parts_list.bbox)
                ]

                log.info(
                    f"{fixture_file} page {page_idx} PartsList id:{parts_list.id} "
                    f"bbox:{parts_list.bbox} contains:"
                )
                for img in part_images_inside:
                    log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
                for count in part_counts_inside:
                    count_text = count.text if isinstance(count, Text) else ""
                    log.info(
                        f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                    )

                # Log deleted part_images if any
                deleted_images = [
                    img for img in all_part_images_inside if result.is_removed(img)
                ]
                if deleted_images:
                    log.warning(
                        f"  WARNING: {len(deleted_images)} part_images DELETED "
                        f"inside parts_list {parts_list.id}:"
                    )
                    for img in deleted_images:
                        log.warning(
                            f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                        )

                # Debug: log all part images to see why they're not inside
                if len(part_images_inside) == 0:
                    log.info("  DEBUG: All part_images on page:")
                    for img in part_images:
                        log.info(
                            f"  - PartImage id:{img.id} bbox:{img.bbox} "
                            f"inside:{img.bbox.fully_inside(parts_list.bbox)}"
                        )
                # Each parts_list must contain at least one part_image fully inside its bbox
                assert len(part_images_inside) >= 1, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain at least one part image"
                )

                # No part_images inside a parts_list should be deleted
                assert len(deleted_images) == 0, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} has "
                    f"{len(deleted_images)} deleted part_images inside it (should be 0)"
                )

                # Each parts_list must contain the same number of part_counts as
                # part_images inside it
                assert len(part_counts_inside) == len(part_images_inside), (
                    f"PartsList id:{parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain {len(part_images_inside)} PartCounts, "
                    f"found {len(part_counts_inside)}"
                )
338

339
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Classify the page, then inspect the surviving parts lists.
            result = classify_elements(page)
            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()

            # Compare every unordered pair of parts lists exactly once.
            for idx, parts_list_a in enumerate(parts_lists):
                remaining = parts_lists[idx + 1 :]
                for parts_list_b in remaining:
                    overlapping = parts_list_a.bbox.overlaps(parts_list_b.bbox)
                    assert not overlapping, (
                        f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                        f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in "
                        f"{fixture_file} page {page_idx} overlap"
                    )
363

364
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)
            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()

            for part_image in classified.part_images():
                # The image passes if any parts_list fully contains its bbox.
                containers = [
                    pl for pl in parts_lists if part_image.bbox.fully_inside(pl.bbox)
                ]

                assert containers, (
                    f"Part image {part_image.id} (bbox:{part_image.bbox}) in "
                    f"{fixture_file} page {page_idx} is not inside any parts_list"
                )
390

391
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Collect every block that is simultaneously labeled and removed.
            labeled_and_deleted = [
                elem
                for elem in page.blocks
                if result.get_label(elem) is not None and result.is_removed(elem)
            ]

            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
                )
                for elem in labeled_and_deleted:
                    log.error(
                        f"  - {result.get_label(elem)} id:{elem.id} "
                        f"bbox:{elem.bbox} [DELETED]"
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )
425

426
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Maps a block id to the label that already claimed it as a winner.
            block_to_winning_label: dict[int, str] = {}

            # Walk every candidate under every label.
            for label, candidates in result.get_all_candidates().items():
                for candidate in candidates:
                    # Only winning candidates matter; synthetic candidates
                    # (no source block) are also excluded.
                    if not candidate.is_winner or candidate.source_block is None:
                        continue

                    block_id = candidate.source_block.id
                    if block_id in block_to_winning_label:
                        existing_label = block_to_winning_label[block_id]
                        pytest.fail(
                            f"Block {block_id} in {fixture_file} page {page_idx} has multiple "
                            f"winner candidates: '{existing_label}' and '{label}'. "
                            "Each block should have at most one winner."
                        )

                    block_to_winning_label[block_id] = label
466

467
    @pytest.mark.parametrize("fixture_file", RAW_FIXTURE_FILES)
    def test_all_winners_discoverable_from_page(self, fixture_file: str) -> None:
        """All winning candidates should be discoverable from the root Page element.

        This test ensures that every winning candidate (constructed LegoPageElement)
        can be found by traversing the Page hierarchy. This validates that:
        1. The page builder properly includes all winners in the hierarchy
        2. No winning candidates are orphaned or lost during page construction
        3. The hierarchical structure is complete

        Note: Some pages are skipped due to known issues in the page builder:
        - Catalog/inventory pages (not yet supported)
        - Pages with orphaned step_numbers or part_counts (bugs to fix)

        TODO: Fix page builder bugs and remove pages from KNOWN_ISSUES skip list.
        """
        # Skip known pages with bugs in the page builder
        # TODO: Fix these bugs and remove from skip list
        KNOWN_ISSUES = frozenset(
            [
                # Catalog/inventory pages with no steps - page builder doesn't support yet
                "6509377_page_180_raw.json",  # 178 orphaned parts/part_counts
                # Regular instruction pages with orphaned winners - bugs to fix
                "6509377_page_010_raw.json",  # 1 orphaned step_number
                "6509377_page_013_raw.json",  # 2 orphaned part_counts
                "6509377_page_014_raw.json",  # 1 orphaned part_count
                "6509377_page_015_raw.json",  # 1 orphaned part_count
            ]
        )

        if fixture_file in KNOWN_ISSUES:
            pytest.skip(f"Skipping {fixture_file}: known page builder issue")

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page_data in enumerate(pages):
            # Run classification
            result = classify_elements(page_data)

            # The classifier attaches the constructed Page hierarchy to the result.
            page = result.page
            if page is None:
                pytest.fail(f"Page element is None for {fixture_file} page {page_idx}")

            # Depth-first walk of the Page hierarchy, recording the identity
            # of every constructed element that can be reached from the root.
            discovered_elements: set[int] = set()
            stack: list[LegoPageElement] = [page]

            while stack:
                element = stack.pop()
                discovered_elements.add(id(element))

                if isinstance(element, Page):
                    # Page attributes
                    if element.page_number:
                        stack.append(element.page_number)
                    if element.progress_bar:
                        stack.append(element.progress_bar)
                    stack.extend(element.steps)
                elif isinstance(element, Step):
                    # Step attributes (all required fields)
                    stack.extend(
                        (element.step_number, element.parts_list, element.diagram)
                    )
                elif isinstance(element, PartsList):
                    stack.extend(element.parts)
                elif isinstance(element, Part):
                    # Part.count is PartCount (LegoPageElement);
                    # Part.diagram is Drawing | None (not LegoPageElement), so skip it.
                    stack.append(element.count)

            # Get all winning candidates (all types, not just structural).
            winning_candidates = [
                (label, candidate)
                for label, candidates in result.get_all_candidates().items()
                for candidate in candidates
                if candidate.is_winner and candidate.constructed is not None
            ]

            # Any winner whose constructed element was never reached is orphaned.
            orphaned = [
                (label, candidate)
                for label, candidate in winning_candidates
                if id(candidate.constructed) not in discovered_elements
            ]

            if orphaned:
                log.error(
                    f"Found {len(orphaned)} winning candidates not discoverable "
                    f"from Page in {fixture_file} page {page_idx}:"
                )
                for label, candidate in orphaned:
                    log.error(
                        f"  - {label}: {candidate.constructed} "
                        f"(id={id(candidate.constructed)})"
                    )

            assert len(orphaned) == 0, (
                f"Found {len(orphaned)} winning candidates that are not "
                f"discoverable from the root Page element in {fixture_file} "
                f"page {page_idx}. All winners should be part of the "
                f"hierarchical structure."
            )
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc