
bramp / build-along · build 19251794703 (push, via GitHub)

11 Nov 2025 01:25AM UTC · coverage: 90.748% (+3.9%) from 86.822%

Commit by bramp:
Update golden files to reflect improved parts list classification

The parts_list_max_area_ratio filter now correctly rejects full-page
drawings (bbox: 0,0 to 552.76,496.06) that were previously incorrectly
classified as parts lists.

Updated golden files:
- 6509377_page_015_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox
- 6509377_page_180_expected.json: Full-page drawing rejected, now uses
  actual parts list with proper bbox

These changes reflect the correct behavior where drawings occupying
>75% of the page area are rejected as likely background elements.

4708 of 5188 relevant lines covered (90.75%) · 0.91 hits per line

Source File

/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py (81.48% of lines covered)
"""Rule-based tests over real fixtures for the PDF element classifier.

This suite validates high-level invariants that must hold after classification.

Rules covered:
- Every parts list must contain at least one part image inside it.
- No two parts lists overlap.
- Each part image is inside a parts list.
- No labeled element is deleted.
- Each element has at most one winner candidate.

Real fixture(s) live under this package's fixtures/ directory.
"""

import logging
from collections import defaultdict
from pathlib import Path

import pytest

from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
from build_a_long.pdf_extract.extractor import ExtractionResult, PageData
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text

log = logging.getLogger(__name__)


def _load_pages_from_fixture(fixture_file: str) -> list[PageData]:
    """Load all pages from a fixture file.

    Args:
        fixture_file: Name of the fixture file (e.g., '6509377_page_010_raw.json')

    Returns:
        All pages from the extraction result

    Raises:
        ValueError: If the fixture contains no pages
    """
    fixture_path = Path(__file__).parent.parent / "fixtures" / fixture_file
    extraction: ExtractionResult = ExtractionResult.model_validate_json(
        fixture_path.read_text()
    )  # type: ignore[assignment]

    if not extraction.pages:
        raise ValueError(f"No pages found in {fixture_file}")

    return extraction.pages


# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult


class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            if include_deleted:
                self._cache[cache_key] = [
                    e for e in self.page.blocks if self.result.get_label(e) == label
                ]
            else:
                self._cache[cache_key] = [
                    e
                    for e in self.page.blocks
                    if self.result.get_label(e) == label
                    and not self.result.is_removed(e)
                ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because
        the hierarchy is based on "smallest containing bbox" which means there
        may be intermediate unlabeled elements between a parent and its
        logical children. For validation rules about spatial containment,
        bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy
        result = []
        for elem in self.page.blocks:
            if id(elem) in self.result.removal_reasons:
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                result.append(elem)
        return result

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts = defaultdict(int)
        for e in self.page.blocks:
            label = (
                self.result.get_label(e) if self.result.get_label(e) else "<unknown>"
            )
            label_counts[label] += 1

        logger.info(f"Label counts: {dict(label_counts)}")


# TODO Replace this with just results.get_blocks_by_label()


def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
    return [
        e
        for e in page.blocks
        if result.get_label(e) == "parts_list" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_blocks_by_label()


def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
    return [
        e
        for e in page.blocks
        if result.get_label(e) == "part_image" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_blocks_by_label()


def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
    return [
        e
        for e in page.blocks
        if result.get_label(e) == "part_count" and not result.is_removed(e)
    ]


def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    label_counts = defaultdict(int)
    for e in page.blocks:
        label = result.get_label(e) if result.get_label(e) else "<unknown>"
        label_counts[label] += 1

    # Cast to a plain dict so the log shows the counts rather than the
    # defaultdict repr ("defaultdict(<class 'int'>, ...").
    log.info(f"Label counts: {dict(label_counts)}")


class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image
        inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """

        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            classified.print_summary()

            parts_lists = classified.parts_lists()
            part_images = classified.part_images()
            part_counts = classified.part_counts()

            # Debug: show all part_image labeled elements including deleted ones
            all_part_images = classified.elements_by_label(
                "part_image", include_deleted=True
            )
            log.info(
                f"Page {page_idx}: Total on page: {len(parts_lists)} parts_lists, "
                f"{len(part_images)} part_images (non-deleted), "
                f"{len(all_part_images)} total part_images, "
                f"{len(part_counts)} part_counts"
            )
            if len(all_part_images) != len(part_images):
                deleted_count = len(all_part_images) - len(part_images)
                log.warning(
                    f"  WARNING: {deleted_count} part_images are DELETED on this page"
                )
                for img in all_part_images:
                    if result.is_removed(img):
                        # Check if it's inside any parts_list
                        inside_any = any(
                            img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                        )
                        location = (
                            "inside a parts_list"
                            if inside_any
                            else "outside all parts_lists"
                        )
                        log.warning(
                            f"    - Deleted PartImage id:{img.id} "
                            f"bbox:{img.bbox} ({location})"
                        )

            for parts_list in parts_lists:
                part_images_inside = classified.children_of(
                    parts_list, label="part_image"
                )
                part_counts_inside = classified.children_of(
                    parts_list, label="part_count"
                )

                # Also get ALL part_images (including deleted) to check for deletion bugs
                all_part_images_inside = []
                for elem in page.blocks:
                    if result.get_label(
                        elem
                    ) == "part_image" and elem.bbox.fully_inside(parts_list.bbox):
                        all_part_images_inside.append(elem)

                log.info(
                    f"{fixture_file} page {page_idx} PartsList id:{parts_list.id} "
                    f"bbox:{parts_list.bbox} contains:"
                )
                for img in part_images_inside:
                    log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
                for count in part_counts_inside:
                    count_text = count.text if isinstance(count, Text) else ""
                    log.info(
                        f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                    )

                # Log deleted part_images if any
                deleted_images = [
                    img for img in all_part_images_inside if result.is_removed(img)
                ]
                if deleted_images:
                    log.warning(
                        f"  WARNING: {len(deleted_images)} part_images DELETED "
                        f"inside parts_list {parts_list.id}:"
                    )
                    for img in deleted_images:
                        log.warning(
                            f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                        )

                # Debug: log all part images to see why they're not inside
                if len(part_images_inside) == 0:
                    log.info("  DEBUG: All part_images on page:")
                    for img in part_images:
                        log.info(
                            f"  - PartImage id:{img.id} bbox:{img.bbox} "
                            f"inside:{img.bbox.fully_inside(parts_list.bbox)}"
                        )
                # Each parts_list must contain at least one part_image fully inside its bbox
                assert len(part_images_inside) >= 1, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain at least one part image"
                )

                # No part_images inside a parts_list should be deleted
                assert len(deleted_images) == 0, (
                    f"Parts list {parts_list.id} in {fixture_file} page {page_idx} has "
                    f"{len(deleted_images)} deleted part_images inside it (should be 0)"
                )

                # Each parts_list must contain the same number of part_counts as
                # part_images inside it
                assert len(part_counts_inside) == len(part_images_inside), (
                    f"PartsList id:{parts_list.id} in {fixture_file} page {page_idx} "
                    f"should contain {len(part_images_inside)} PartCounts, "
                    f"found {len(part_counts_inside)}"
                )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()

            # Check all pairs of parts lists for overlap
            for i, parts_list_a in enumerate(parts_lists):
                for parts_list_b in parts_lists[i + 1 :]:
                    assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                        f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                        f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in "
                        f"{fixture_file} page {page_idx} overlap"
                    )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            classified = ClassifiedPage(page, result)
            parts_lists = classified.parts_lists()
            part_images = classified.part_images()

            for part_image in part_images:
                # Check if this part_image is inside at least one parts_list
                inside_any_parts_list = any(
                    part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
                )

                assert inside_any_parts_list, (
                    f"Part image {part_image.id} (bbox:{part_image.bbox}) in "
                    f"{fixture_file} page {page_idx} is not inside any parts_list"
                )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Find all elements that are both labeled and deleted
            labeled_and_deleted = []
            for elem in page.blocks:
                if result.get_label(elem) is not None and result.is_removed(elem):
                    labeled_and_deleted.append(elem)

            if labeled_and_deleted:
                log.error(
                    f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
                )
                for elem in labeled_and_deleted:
                    log.error(
                        f"  - {result.get_label(elem)} id:{elem.id} "
                        f"bbox:{elem.bbox} [DELETED]"
                    )

            assert len(labeled_and_deleted) == 0, (
                f"Found {len(labeled_and_deleted)} labeled elements that are "
                f"deleted in {fixture_file} page {page_idx}. "
                f"Labeled elements should not be deleted."
            )

    @pytest.mark.parametrize(
        "fixture_file",
        [
            f.name
            for f in (Path(__file__).parent.parent / "fixtures").glob("*_raw.json")
        ],
    )
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        pages = _load_pages_from_fixture(fixture_file)

        for page_idx, page in enumerate(pages):
            # Run the full classification pipeline on the page
            result = classify_elements(page)

            # Track which blocks have won, and for which label
            block_to_winning_label: dict[int, str] = {}

            # Check all candidates across all labels
            all_candidates = result.get_all_candidates()
            for label, candidates in all_candidates.items():
                for candidate in candidates:
                    if not candidate.is_winner:
                        continue

                    # Skip synthetic candidates (no source block)
                    if candidate.source_block is None:
                        continue

                    block_id = candidate.source_block.id

                    # Check if this block already has a winner
                    if block_id in block_to_winning_label:
                        existing_label = block_to_winning_label[block_id]
                        pytest.fail(
                            f"Block {block_id} in {fixture_file} page {page_idx} has multiple "
                            f"winner candidates: '{existing_label}' and '{label}'. "
                            "Each block should have at most one winner."
                        )

                    block_to_winning_label[block_id] = label