• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 19155446196

06 Nov 2025 04:44AM UTC coverage: 85.36% (-0.02%) from 85.381%
19155446196

push

github

bramp
refactor: complete Block rename and terminology cleanup

Renamed page_elements.py → page_blocks.py and systematically updated all
references to use 'block' terminology for raw PDF primitives throughout
the codebase.

Key changes:
- Renamed Element class → Block in page_blocks.py
- Updated all imports and type references across 40+ files
- Renamed internal variables and method parameters:
  - _element_winners → _block_winners
  - _validate_element_in_page_data() → _validate_block_in_page_data()
  - element_to_labels → block_to_labels
  - total_elements → total_blocks
  - And many more variable renames in main.py, tests, and classifiers
- Updated all docstrings, comments, and error messages
- Updated JSON fixtures to use 'blocks' instead of 'elements'
- Updated documentation (README files)

Terminology is now consistent:
- Block = raw PDF primitive (Text, Image, Drawing from pymupdf)
- Element = LEGO semantic component (Part, StepNumber, PartsList, etc.)

All 20 tests passing.

472 of 535 new or added lines in 34 files covered. (88.22%)

6 existing lines in 3 files now uncovered.

4064 of 4761 relevant lines covered (85.36%)

0.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

21.02
/src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
1
"""Rule-based tests over real fixtures for the PDF element classifier.
2

3
This suite validates high-level invariants that must hold after classification.
4

5
Rules covered:
6
- Every parts list must contain at least one part image inside it.
7
- No two parts lists overlap.
8
- Each part image is inside a parts list.
9
- Each element has at most one winner candidate.
- No labeled element is marked as deleted.
10

11
Real fixture(s) live under this package's fixtures/ directory.
12
"""
13

14
import logging
1✔
15
from collections import defaultdict
1✔
16
from pathlib import Path
1✔
17

18
import pytest
1✔
19

20
from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
1✔
21
from build_a_long.pdf_extract.extractor import PageData
1✔
22
from build_a_long.pdf_extract.extractor.page_blocks import Block, Text
1✔
23

24
log = logging.getLogger(__name__)
1✔
25

26
# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult
27

28

29
class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified blocks.

    Pairs a page with its ClassificationResult and offers helpers to query
    blocks by label, plus a spatial query that finds blocks lying inside a
    parent's bbox. Label queries are cached per (label, include_deleted)
    combination.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        # Keyed by "<label>:deleted=<bool>"; filled lazily by elements_by_label().
        self._cache: dict[str, list[Block]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Block]:
        """Get all blocks on the page with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to also return blocks that the result
                reports as removed

        Returns:
            List of blocks with a matching label, in page order. The list is
            the cached object itself, so repeated calls return the same list;
            callers should not mutate it.
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            self._cache[cache_key] = [
                block
                for block in self.page.blocks
                if self.result.get_label(block) == label
                and (include_deleted or not self.result.is_removed(block))
            ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Block]:
        """Get all non-deleted parts_list blocks."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Block]:
        """Get all non-deleted part_image blocks."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Block]:
        """Get all non-deleted part_count blocks."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Block]:
        """Get all non-deleted step_number blocks."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Block, label: str | None = None) -> list[Block]:
        """Return all non-deleted blocks spatially contained within a parent block.

        Note: This uses bbox containment, not ElementTree hierarchy, because the
        hierarchy is based on "smallest containing bbox" which means there may be
        intermediate unlabeled blocks between a parent and its logical children.
        For validation rules about spatial containment, bbox checking is more
        appropriate.

        Args:
            parent: The parent block to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted blocks matching the label (if specified) whose
            bbox reports ``fully_inside`` the parent's bbox, in page order
        """
        # Removal is checked through the public is_removed() API, the same way
        # elements_by_label() does it, rather than by reaching into the
        # result's private bookkeeping.
        return [
            block
            for block in self.page.blocks
            if not self.result.is_removed(block)
            and (label is None or self.result.get_label(block) == label)
            and block.bbox.fully_inside(parent.bbox)
        ]

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of how many blocks carry each label.

        Blocks without a label are counted under "<unknown>".

        Args:
            logger: Logger to use (defaults to module logger)
        """
        if logger is None:
            logger = log
        label_counts: defaultdict[str, int] = defaultdict(int)
        for block in self.page.blocks:
            # Single lookup per block; a falsy label is reported as unknown.
            label_counts[self.result.get_label(block) or "<unknown>"] += 1

        # Converted to a plain dict so the message does not include the
        # defaultdict wrapper repr.
        logger.info(f"Label counts: {dict(label_counts)}")
133

134

135
# TODO Replace this with just results.get_blocks_by_label()
136

137

138
def _parts_lists(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return the blocks of ``page`` labeled "parts_list" that are not removed.

    The label and removal state are both read from ``result``; blocks are
    returned in page order.
    """
    kept: list[Block] = []
    for block in page.blocks:
        if result.get_label(block) != "parts_list":
            continue
        if result.is_removed(block):
            continue
        kept.append(block)
    return kept
144

145

146
# TODO Replace this with just results.get_blocks_by_label()
147

148

149
def _part_images(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return the blocks of ``page`` labeled "part_image" that are not removed.

    The label and removal state are both read from ``result``; blocks are
    returned in page order.
    """
    kept: list[Block] = []
    for block in page.blocks:
        if result.get_label(block) != "part_image":
            continue
        if result.is_removed(block):
            continue
        kept.append(block)
    return kept
155

156

157
# TODO Replace this with just results.get_blocks_by_label()
158

159

160
def _part_counts(page: PageData, result: ClassificationResult) -> list[Block]:
    """Return the blocks of ``page`` labeled "part_count" that are not removed.

    The label and removal state are both read from ``result``; blocks are
    returned in page order.
    """
    kept: list[Block] = []
    for block in page.blocks:
        if result.get_label(block) != "part_count":
            continue
        if result.is_removed(block):
            continue
        kept.append(block)
    return kept
166

167

168
def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    """Log how many blocks of ``page`` carry each label.

    Labels are read from ``result``; blocks without a label are counted under
    "<unknown>". The summary is written to the module logger at INFO level.
    """
    label_counts: defaultdict[str, int] = defaultdict(int)
    for block in page.blocks:
        # Single lookup per block; a falsy label is reported as unknown.
        label_counts[result.get_label(block) or "<unknown>"] += 1

    # Log a plain dict so the message shows only the counts, not the
    # "defaultdict(<class 'int'>, ...)" wrapper repr.
    log.info(f"Label counts: {dict(label_counts)}")
177

178

179
@pytest.mark.skip(reason="Not working yet.")
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification.

    Every test is parametrized over the JSON fixtures found in the
    ``fixtures`` directory next to this file; each fixture is loaded as a
    PageData and run through classify_elements() before the rule is checked.
    """

    _FIXTURE_DIR = Path(__file__).with_name("fixtures")
    # Sorted so the parametrized tests are collected in a stable order no
    # matter which order the filesystem lists the directory in.
    _FIXTURE_FILES = sorted(f.name for f in _FIXTURE_DIR.glob("*.json"))

    @classmethod
    def _classify_fixture(
        cls, fixture_file: str
    ) -> tuple[PageData, ClassificationResult]:
        """Load one fixture and run the full classification pipeline on it.

        Args:
            fixture_file: File name of a JSON fixture inside the fixtures dir

        Returns:
            The loaded page together with its classification result
        """
        fixture_path = cls._FIXTURE_DIR.joinpath(fixture_file)
        # Decode explicitly as UTF-8 rather than the platform default encoding.
        page: PageData = PageData.from_json(  # type: ignore[assignment]
            fixture_path.read_text(encoding="utf-8")
        )
        return page, classify_elements(page)

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image inside its bbox.

        Also checks that no part image inside a parts list is removed and that
        each parts list holds as many part counts as part images.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        classified.print_summary()

        parts_lists = classified.parts_lists()
        part_images = classified.part_images()
        part_counts = classified.part_counts()

        # Debug: all part_image labeled blocks, including deleted ones
        all_part_images = classified.elements_by_label(
            "part_image", include_deleted=True
        )
        log.info(
            f"Total on page: {len(parts_lists)} parts_lists, "
            f"{len(part_images)} part_images (non-deleted), "
            f"{len(all_part_images)} total part_images, "
            f"{len(part_counts)} part_counts"
        )
        if len(all_part_images) != len(part_images):
            deleted_count = len(all_part_images) - len(part_images)
            log.warning(
                f"  WARNING: {deleted_count} part_images are DELETED on this page"
            )
            for img in all_part_images:
                if not result.is_removed(img):
                    continue
                # Report whether the deleted image sits inside any parts_list
                inside_any = any(
                    img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                )
                location = (
                    "inside a parts_list" if inside_any else "outside all parts_lists"
                )
                log.warning(
                    f"    - Deleted PartImage id:{img.id} bbox:{img.bbox} ({location})"
                )

        for parts_list in parts_lists:
            part_images_inside = classified.children_of(parts_list, label="part_image")
            part_counts_inside = classified.children_of(parts_list, label="part_count")

            # ALL part_images inside this list (including deleted) so that
            # deletion bugs can be detected below.
            all_part_images_inside = [
                img
                for img in all_part_images
                if img.bbox.fully_inside(parts_list.bbox)
            ]

            log.info(
                f"{fixture_file} PartsList id:{parts_list.id} bbox:{parts_list.bbox} contains:"
            )
            for img in part_images_inside:
                log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
            for count in part_counts_inside:
                # Only Text blocks carry a text attribute
                count_text = count.text if isinstance(count, Text) else ""
                log.info(
                    f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                )

            # Log deleted part_images if any
            deleted_images = [
                img for img in all_part_images_inside if result.is_removed(img)
            ]
            if deleted_images:
                log.warning(
                    f"  WARNING: {len(deleted_images)} part_images DELETED inside parts_list {parts_list.id}:"
                )
                for img in deleted_images:
                    log.warning(
                        f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                    )

            # Debug: log all part images to see why they're not inside
            if not part_images_inside:
                log.info("  DEBUG: All part_images on page:")
                for img in part_images:
                    log.info(
                        f"  - PartImage id:{img.id} bbox:{img.bbox} inside:{img.bbox.fully_inside(parts_list.bbox)}"
                    )

            # Each parts_list must contain at least one part_image fully inside its bbox
            assert part_images_inside, (
                f"Parts list {parts_list.id} in {fixture_file} should contain at least one part image"
            )

            # No part_images inside a parts_list should be deleted
            assert not deleted_images, (
                f"Parts list {parts_list.id} in {fixture_file} has {len(deleted_images)} "
                f"deleted part_images inside it (should be 0)"
            )

            # Each parts_list must contain the same number of part_counts as
            # part_images inside it
            assert len(part_counts_inside) == len(part_images_inside), (
                f"PartsList id:{parts_list.id} in {fixture_file} should contain "
                f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()

        # Check every unordered pair of parts lists exactly once
        for i, parts_list_a in enumerate(parts_lists):
            for parts_list_b in parts_lists[i + 1 :]:
                assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                    f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                    f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in {fixture_file} overlap"
                )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        page, result = self._classify_fixture(fixture_file)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()
        part_images = classified.part_images()

        for part_image in part_images:
            # Check if this part_image is inside at least one parts_list
            inside_any_parts_list = any(
                part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
            )

            assert inside_any_parts_list, (
                f"Part image {part_image.id} (bbox:{part_image.bbox}) in {fixture_file} "
                f"is not inside any parts_list"
            )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        page, result = self._classify_fixture(fixture_file)

        # Find all blocks that are both labeled and deleted
        labeled_and_deleted = [
            elem
            for elem in page.blocks
            if result.get_label(elem) is not None and result.is_removed(elem)
        ]

        if labeled_and_deleted:
            log.error(
                f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
            )
            for elem in labeled_and_deleted:
                log.error(
                    f"  - {result.get_label(elem)} id:{elem.id} bbox:{elem.bbox} [DELETED]"
                )

        assert not labeled_and_deleted, (
            f"Found {len(labeled_and_deleted)} labeled elements that are deleted in {fixture_file}. "
            f"Labeled elements should not be deleted."
        )

    @pytest.mark.parametrize("fixture_file", _FIXTURE_FILES)
    def test_each_element_has_at_most_one_winner(self, fixture_file: str) -> None:
        """Each element should have at most one winner candidate across all labels.

        An element can have multiple candidates across different labels, but only
        one of them should be marked as a winner. This ensures classification
        decisions are unambiguous.
        """
        _, result = self._classify_fixture(fixture_file)

        # Track which blocks have won, and for which label
        block_to_winning_label: dict[int, str] = {}

        # Check all candidates across all labels
        all_candidates = result.get_all_candidates()
        for label, candidates in all_candidates.items():
            for candidate in candidates:
                if not candidate.is_winner:
                    continue

                # Skip synthetic candidates (no source block)
                if candidate.source_block is None:
                    continue

                block_id = candidate.source_block.id

                # A second winner for the same block is a failure
                if block_id in block_to_winning_label:
                    existing_label = block_to_winning_label[block_id]
                    pytest.fail(
                        f"Block {block_id} in {fixture_file} has multiple winner candidates: "
                        f"'{existing_label}' and '{label}'. Each block should have at most one winner."
                    )

                block_to_winning_label[block_id] = label
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc