• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19217904329

10 Nov 2025 01:35AM UTC coverage: 87.121% (+0.7%) from 86.426%
19217904329

push

github

bramp
Bumped some dependencies.

4600 of 5280 relevant lines covered (87.12%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

21.02
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- Every parts list must contain at least one part image inside it.
7
- No two parts lists overlap.
8
- Each part image is inside a parts list.
9
- Each element has at most one winner candidate.
10

11
Real fixture(s) live under this package's fixtures/ directory.
12
"""
13

14
import logging
1✔
15
from collections import defaultdict
1✔
16
from pathlib import Path
1✔
17

18
import pytest
1✔
19

20
from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
1✔
21
from build_a_long.pdf_extract.extractor import PageData
1✔
22
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text
1✔
23

24
log = logging.getLogger(__name__)
1✔
25

26
# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult
27

28

29
class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        # Maps "{label}:deleted={bool}" -> matching blocks; populated lazily
        # by elements_by_label().
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label (cached per label/flag pair)
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            # A single comprehension covers both cases: when include_deleted
            # is True the removal check short-circuits away.
            self._cache[cache_key] = [
                e
                for e in self.page.blocks
                if self.result.get_label(e) == label
                and (include_deleted or not self.result.is_removed(e))
            ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because
        the hierarchy is based on "smallest containing bbox" which means there
        may be intermediate unlabeled elements between a parent and its
        logical children. For validation rules about spatial containment,
        bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy.
        # NOTE(review): the removal check previously read
        # `id(elem) in self.result.removal_reasons`; switched to
        # result.is_removed() for consistency with the rest of this module —
        # assumed equivalent, confirm against ClassificationResult.
        contained: list[Block] = []
        for elem in self.page.blocks:
            if self.result.is_removed(elem):
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                contained.append(elem)
        return contained

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts: defaultdict[str, int] = defaultdict(int)
        for e in self.page.blocks:
            # Single get_label() call per element; falsy labels count
            # as "<unknown>".
            label_counts[self.result.get_label(e) or "<unknown>"] += 1

        logger.info(f"Label counts: {dict(label_counts)}")
134

135

136
# TODO Replace this with just results.get_blocks_by_label()
137

138

139
def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
1✔
140
    return [
×
141
        e
142
        for e in page.blocks
143
        if result.get_label(e) == "parts_list" and not result.is_removed(e)
144
    ]
145

146

147
# TODO Replace this with just results.get_blocks_by_label()
148

149

150
def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
1✔
151
    return [
×
152
        e
153
        for e in page.blocks
154
        if result.get_label(e) == "part_image" and not result.is_removed(e)
155
    ]
156

157

158
# TODO Replace this with just results.get_blocks_by_label()
159

160

161
def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
1✔
162
    return [
×
163
        e
164
        for e in page.blocks
165
        if result.get_label(e) == "part_count" and not result.is_removed(e)
166
    ]
167

168

169
def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks carry each label ("<unknown>" for unlabeled).

    Args:
        page: The page whose blocks are tallied.
        result: Classification result used to look up each block's label.
    """
    label_counts: defaultdict[str, int] = defaultdict(int)
    for e in page.blocks:
        # Single get_label() call per element; falsy labels count as unknown.
        label_counts[result.get_label(e) or "<unknown>"] += 1

    # Convert to a plain dict so the log reads "{...}" instead of
    # "defaultdict(<class 'int'>, {...})" (resolves the old TODO).
    log.info(f"Label counts: {dict(label_counts)}")
178

179

180
@pytest.mark.skip(reason="Not working yet.")
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    # Computed once at class-creation time and shared by every parametrized
    # test below (names defined earlier in a class body are visible to the
    # method decorators). Previously this glob was duplicated five times.
    _FIXTURE_NAMES = [
        f.name for f in (Path(__file__).parent.parent / "fixtures").glob("*.json")
    ]

    @staticmethod
    def _classify_fixture(fixture_file: str) -> tuple[PageData, ClassificationResult]:
        """Load the named JSON fixture and run the full classification pipeline.

        Args:
            fixture_file: Name of a file in this package's fixtures/ directory.

        Returns:
            The parsed PageData and the ClassificationResult produced by
            classify_elements().
        """
        fixture_path = Path(__file__).parent.parent / "fixtures" / fixture_file
        page: PageData = PageData.model_validate_json(fixture_path.read_text())
        return page, classify_elements(page)

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image
        inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        classified.print_summary()

        parts_lists = classified.parts_lists()
        part_images = classified.part_images()
        part_counts = classified.part_counts()

        # Debug: show all part_image labeled elements including deleted ones
        all_part_images = classified.elements_by_label(
            "part_image", include_deleted=True
        )
        log.info(
            f"Total on page: {len(parts_lists)} parts_lists, "
            f"{len(part_images)} part_images (non-deleted), "
            f"{len(all_part_images)} total part_images, "
            f"{len(part_counts)} part_counts"
        )
        if len(all_part_images) != len(part_images):
            deleted_count = len(all_part_images) - len(part_images)
            log.warning(
                f"  WARNING: {deleted_count} part_images are DELETED on this page"
            )
            for img in all_part_images:
                if result.is_removed(img):
                    # Check if it's inside any parts_list
                    inside_any = any(
                        img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                    )
                    location = (
                        "inside a parts_list"
                        if inside_any
                        else "outside all parts_lists"
                    )
                    log.warning(
                        f"    - Deleted PartImage id:{img.id} "
                        f"bbox:{img.bbox} ({location})"
                    )

        for parts_list in parts_lists:
            part_images_inside = classified.children_of(parts_list, label="part_image")
            part_counts_inside = classified.children_of(parts_list, label="part_count")

            # Also get ALL part_images (including deleted) to check for deletion bugs
            all_part_images_inside = [
                elem
                for elem in page.blocks
                if result.get_label(elem) == "part_image"
                and elem.bbox.fully_inside(parts_list.bbox)
            ]

            log.info(
                f"{fixture_file} PartsList id:{parts_list.id} "
                f"bbox:{parts_list.bbox} contains:"
            )
            for img in part_images_inside:
                log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
            for count in part_counts_inside:
                count_text = count.text if isinstance(count, Text) else ""
                log.info(
                    f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                )

            # Log deleted part_images if any
            deleted_images = [
                img for img in all_part_images_inside if result.is_removed(img)
            ]
            if deleted_images:
                log.warning(
                    f"  WARNING: {len(deleted_images)} part_images DELETED "
                    f"inside parts_list {parts_list.id}:"
                )
                for img in deleted_images:
                    log.warning(
                        f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                    )

            # Debug: log all part images to see why they're not inside
            if len(part_images_inside) == 0:
                log.info("  DEBUG: All part_images on page:")
                for img in part_images:
                    log.info(
                        f"  - PartImage id:{img.id} bbox:{img.bbox} "
                        f"inside:{img.bbox.fully_inside(parts_list.bbox)}"
                    )
            # Each parts_list must contain at least one part_image fully inside its bbox
            assert len(part_images_inside) >= 1, (
                f"Parts list {parts_list.id} in {fixture_file} should contain "
                f"at least one part image"
            )

            # No part_images inside a parts_list should be deleted
            assert len(deleted_images) == 0, (
                f"Parts list {parts_list.id} in {fixture_file} has "
                f"{len(deleted_images)} deleted part_images inside it (should be 0)"
            )

            # Each parts_list must contain the same number of part_counts as
            # part_images inside it
            assert len(part_counts_inside) == len(part_images_inside), (
                f"PartsList id:{parts_list.id} in {fixture_file} should contain "
                f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()

        # Check all pairs of parts lists for overlap
        for i, parts_list_a in enumerate(parts_lists):
            for parts_list_b in parts_lists[i + 1 :]:
                assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                    f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                    f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in "
                    f"{fixture_file} overlap"
                )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()
        part_images = classified.part_images()

        for part_image in part_images:
            # Check if this part_image is inside at least one parts_list
            inside_any_parts_list = any(
                part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
            )

            assert inside_any_parts_list, (
                f"Part image {part_image.id} (bbox:{part_image.bbox}) in "
                f"{fixture_file} is not inside any parts_list"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        page, result = self._classify_fixture(fixture_file)

        # Find all elements that are both labeled and deleted
        labeled_and_deleted = [
            elem
            for elem in page.blocks
            if result.get_label(elem) is not None and result.is_removed(elem)
        ]

        if labeled_and_deleted:
            log.error(
                f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
            )
            for elem in labeled_and_deleted:
                log.error(
                    f"  - {result.get_label(elem)} id:{elem.id} "
                    f"bbox:{elem.bbox} [DELETED]"
                )

        assert len(labeled_and_deleted) == 0, (
            f"Found {len(labeled_and_deleted)} labeled elements that are "
            f"deleted in {fixture_file}. Labeled elements should not be deleted."
        )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_NAMES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        page, result = self._classify_fixture(fixture_file)

        # Track which blocks have won, and for which label
        block_to_winning_label: dict[int, str] = {}

        # Check all candidates across all labels
        all_candidates = result.get_all_candidates()
        for label, candidates in all_candidates.items():
            for candidate in candidates:
                if not candidate.is_winner:
                    continue

                # Skip synthetic candidates (no source block)
                if candidate.source_block is None:
                    continue

                block_id = candidate.source_block.id

                # Check if this block already has a winner
                if block_id in block_to_winning_label:
                    existing_label = block_to_winning_label[block_id]
                    pytest.fail(
                        f"Block {block_id} in {fixture_file} has multiple "
                        f"winner candidates: '{existing_label}' and '{label}'. "
                        "Each block should have at most one winner."
                    )

                block_to_winning_label[block_id] = label
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc