bramp / build-along / build 19060277498
04 Nov 2025 06:46AM UTC coverage: 84.229% (-0.02%) from 84.251%
Push build via github by bramp
Commit: Chore: Enabled some more lint checks.

Modified files:
  src/build_a_long/downloader/legocom.py
  src/build_a_long/downloader/metadata.py
  src/build_a_long/pdf_extract/analyze_classifier.py
  src/build_a_long/pdf_extract/classifier/classification_result.py
  src/build_a_long/pdf_extract/classifier/classifier.py
  src/build_a_long/pdf_extract/classifier/classifier_golden_test.py
  src/build_a_long/pdf_extract/classifier/classifier_rules_test.py
  src/build_a_long/pdf_extract/classifier/classifier_test.py
  src/build_a_long/pdf_extract/classifier/hierarchy_builder.py
  src/build_a_long/pdf_extract/classifier/hierarchy_builder_test.py
  src/build_a_long/pdf_extract/classifier/label_classifier.py
  src/build_a_long/pdf_extract/classifier/lego_page_builder.py
  src/build_a_long/pdf_extract/classifier/lego_page_builder_test.py
  src/build_a_long/pdf_extract/classifier/page_number_classifier.py
  src/build_a_long/pdf_extract/classifier/part_count_classifier.py
  src/build_a_long/pdf_extract/classifier/parts_image_classifier.py
  src/build_a_long/pdf_extract/classifier/parts_list_classifier.py
  src/build_a_long/pdf_extract/classifier/step_classifier.py
  src/build_a_long/pdf_extract/classifier/step_number_classifier.py
  src/build_a_long/pdf_extract/classifier/text_extractors.py
  src/build_a_long/pdf_extract/extractor/bbox.py
  src/build_a_long/pdf_extract/extractor/extractor.py
  src/build_a_long/pdf_extract/extractor/hierarchy.py
  src/build_a_long/pdf_extract/extractor/lego_page_elements.py
  src/build_a_long/pdf_extract/extractor/page_data_json_test.py
  src/build_a_long/pdf_extract/extractor/page_elements.py
  src/build_a_long/pdf_extract/extractor/pymupdf_types.py
  src/build_a_long/pdf... (continued)

164 of 175 new or added lines in 28 files covered. (93.71%)

255 existing lines in 12 files now uncovered.

3573 of 4242 relevant lines covered (84.23%)

0.84 hits per line
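As a quick sanity check on the reported totals, the arithmetic works out (an illustrative throwaway snippet, not part of the build):

covered, relevant = 3573, 4242
print(f"{covered / relevant:.2%}")  # -> 84.23%, matching the per-build figure above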

Source File: /src/build_a_long/pdf_extract/classifier/classifier_rules_test.py (22.3% covered)
"""Rule-based tests over real fixtures for the PDF element classifier.

This suite validates high-level invariants that must hold after classification.

Rules covered:
- Every parts list must contain at least one part image inside it.
- No two parts lists overlap.
- Each part image is inside a parts list.

Real fixture(s) live under this package's fixtures/ directory.
"""

import logging
from collections import defaultdict
from pathlib import Path

import pytest

from build_a_long.pdf_extract.classifier import ClassificationResult, classify_elements
from build_a_long.pdf_extract.extractor import PageData
from build_a_long.pdf_extract.extractor.page_elements import Element, Text

log = logging.getLogger(__name__)

# TODO A lot of the methods in ClassifiedPage overlap with ClassificationResult


class ClassifiedPage:
    """Wrapper around PageData providing convenient access to classified elements.

    This class provides helper methods to query elements by label type and
    supports hierarchical queries (e.g., finding children inside parent bboxes).
    Results are cached for efficiency.
    """

    def __init__(self, page: PageData, result: ClassificationResult):
        """Initialize with a classified PageData and its result.

        Args:
            page: PageData that has been run through classify_elements()
            result: The ClassificationResult for this page
        """
        self.page = page
        self.result = result
        self._cache: dict[str, list[Element]] = {}

    def elements_by_label(
        self, label: str, include_deleted: bool = False
    ) -> list[Element]:
        """Get all elements with the given label.

        Args:
            label: The label to filter by
            include_deleted: Whether to include deleted elements

        Returns:
            List of elements with matching label
        """
        cache_key = f"{label}:deleted={include_deleted}"
        if cache_key not in self._cache:
            if include_deleted:
                self._cache[cache_key] = [
                    e for e in self.page.elements if self.result.get_label(e) == label
                ]
            else:
                self._cache[cache_key] = [
                    e
                    for e in self.page.elements
                    if self.result.get_label(e) == label
                    and not self.result.is_removed(e)
                ]
        return self._cache[cache_key]

    def parts_lists(self) -> list[Element]:
        """Get all non-deleted parts_list elements."""
        return self.elements_by_label("parts_list")

    def part_images(self) -> list[Element]:
        """Get all non-deleted part_image elements."""
        return self.elements_by_label("part_image")

    def part_counts(self) -> list[Element]:
        """Get all non-deleted part_count elements."""
        return self.elements_by_label("part_count")

    def step_numbers(self) -> list[Element]:
        """Get all non-deleted step_number elements."""
        return self.elements_by_label("step_number")

    def children_of(self, parent: Element, label: str | None = None) -> list[Element]:
        """Return all non-deleted elements spatially contained within a parent element.

        Note: This uses bbox containment, not ElementTree hierarchy, because the hierarchy
        is based on "smallest containing bbox" which means there may be intermediate
        unlabeled elements between a parent and its logical children. For validation
        rules about spatial containment, bbox checking is more appropriate.

        Args:
            parent: The parent element to search within
            label: Optional label filter (e.g., "part_image")

        Returns:
            List of non-deleted Elements matching the label (if specified) that
            are fully contained within the parent's bbox
        """
        # Use spatial containment, not hierarchy
        result = []
        for elem in self.page.elements:
            if id(elem) in self.result._removal_reasons:
                continue
            if label is not None and self.result.get_label(elem) != label:
                continue
            if elem.bbox.fully_inside(parent.bbox):
                result.append(elem)
        return result

    def print_summary(self, logger: logging.Logger | None = None) -> None:
        """Log a summary of labeled elements.

        Args:
            logger: Logger to use (defaults to module logger)
        """
        logger = logger or log
        label_counts = defaultdict(int)
        for e in self.page.elements:
            label = (
                self.result.get_label(e) if self.result.get_label(e) else "<unknown>"
            )
            label_counts[label] += 1

        logger.info(f"Label counts: {dict(label_counts)}")


# TODO Replace this with just results.get_elements_by_label()


def _parts_lists(page: PageData, result: ClassificationResult) -> list[Element]:
    return [
        e
        for e in page.elements
        if result.get_label(e) == "parts_list" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_elements_by_label()


def _part_images(page: PageData, result: ClassificationResult) -> list[Element]:
    return [
        e
        for e in page.elements
        if result.get_label(e) == "part_image" and not result.is_removed(e)
    ]


# TODO Replace this with just results.get_elements_by_label()


def _part_counts(page: PageData, result: ClassificationResult) -> list[Element]:
    return [
        e
        for e in page.elements
        if result.get_label(e) == "part_count" and not result.is_removed(e)
    ]


def _print_label_counts(page: PageData, result: ClassificationResult) -> None:
    label_counts = defaultdict(int)
    for e in page.elements:
        label = result.get_label(e) if result.get_label(e) else "<unknown>"
        label_counts[label] += 1

    # TODO The following logging shows "defaultdict(<class 'int'>,..." figure
    # out how to avoid that.
    log.info(f"Label counts: {label_counts}")


@pytest.mark.skip(reason="Not working yet.")
class TestClassifierRules:
    """End-to-end rules that must hold on real pages after classification."""

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_parts_list_contains_at_least_one_part_image(
        self, fixture_file: str
    ) -> None:
        """Every labeled parts list should include at least one part image inside its bbox.

        This test runs on all JSON fixtures in the fixtures/ directory.
        """

        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        classified = ClassifiedPage(page, result)
        classified.print_summary()

        parts_lists = classified.parts_lists()
        part_images = classified.part_images()
        part_counts = classified.part_counts()

        # Debug: show all part_image labeled elements including deleted ones
        all_part_images = classified.elements_by_label(
            "part_image", include_deleted=True
        )
        log.info(
            f"Total on page: {len(parts_lists)} parts_lists, {len(part_images)} part_images (non-deleted), {len(all_part_images)} total part_images, {len(part_counts)} part_counts"
        )
        if len(all_part_images) != len(part_images):
            deleted_count = len(all_part_images) - len(part_images)
            log.warning(
                f"  WARNING: {deleted_count} part_images are DELETED on this page"
            )
            for img in all_part_images:
                if result.is_removed(img):
                    # Check if it's inside any parts_list
                    inside_any = any(
                        img.bbox.fully_inside(pl.bbox) for pl in parts_lists
                    )
                    location = (
                        "inside a parts_list"
                        if inside_any
                        else "outside all parts_lists"
                    )
                    log.warning(
                        f"    - Deleted PartImage id:{img.id} bbox:{img.bbox} ({location})"
                    )

        for parts_list in parts_lists:
            part_images_inside = classified.children_of(parts_list, label="part_image")
            part_counts_inside = classified.children_of(parts_list, label="part_count")

            # Also get ALL part_images (including deleted) to check for deletion bugs
            all_part_images_inside = []
            for elem in page.elements:
                if result.get_label(elem) == "part_image" and elem.bbox.fully_inside(
                    parts_list.bbox
                ):
                    all_part_images_inside.append(elem)

            log.info(
                f"{fixture_file} PartsList id:{parts_list.id} bbox:{parts_list.bbox} contains:"
            )
            for img in part_images_inside:
                log.info(f" - PartImage id:{img.id} bbox:{img.bbox}")
            for count in part_counts_inside:
                count_text = count.text if isinstance(count, Text) else ""
                log.info(
                    f" - PartCount id:{count.id} text:{count_text} bbox:{count.bbox}"
                )

            # Log deleted part_images if any
            deleted_images = [
                img for img in all_part_images_inside if result.is_removed(img)
            ]
            if deleted_images:
                log.warning(
                    f"  WARNING: {len(deleted_images)} part_images DELETED inside parts_list {parts_list.id}:"
                )
                for img in deleted_images:
                    log.warning(
                        f"    - PartImage id:{img.id} bbox:{img.bbox} [DELETED]"
                    )

            # Debug: log all part images to see why they're not inside
            if len(part_images_inside) == 0:
                log.info("  DEBUG: All part_images on page:")
                for img in part_images:
                    log.info(
                        f"  - PartImage id:{img.id} bbox:{img.bbox} inside:{img.bbox.fully_inside(parts_list.bbox)}"
                    )

            # Each parts_list must contain at least one part_image fully inside its bbox
            assert len(part_images_inside) >= 1, (
                f"Parts list {parts_list.id} in {fixture_file} should contain at least one part image"
            )

            # No part_images inside a parts_list should be deleted
            assert len(deleted_images) == 0, (
                f"Parts list {parts_list.id} in {fixture_file} has {len(deleted_images)} "
                f"deleted part_images inside it (should be 0)"
            )

            # Each parts_list must contain the same number of part_counts as
            # part_images inside it
            assert len(part_counts_inside) == len(part_images_inside), (
                f"PartsList id:{parts_list.id} in {fixture_file} should contain "
                f"{len(part_images_inside)} PartCounts, found {len(part_counts_inside)}"
            )

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_parts_lists_do_not_overlap(self, fixture_file: str) -> None:
        """No two parts lists should overlap.

        Parts lists represent distinct areas of the page and should not
        have overlapping bounding boxes.
        """
        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()

        # Check all pairs of parts lists for overlap
        for i, parts_list_a in enumerate(parts_lists):
            for parts_list_b in parts_lists[i + 1 :]:
                assert not parts_list_a.bbox.overlaps(parts_list_b.bbox), (
                    f"Parts lists {parts_list_a.id} (bbox:{parts_list_a.bbox}) and "
                    f"{parts_list_b.id} (bbox:{parts_list_b.bbox}) in {fixture_file} overlap"
                )

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_each_part_image_is_inside_a_parts_list(self, fixture_file: str) -> None:
        """Each part image must be inside at least one parts list.

        Every part_image should be contained within a parts_list's bounding box.
        """
        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        classified = ClassifiedPage(page, result)
        parts_lists = classified.parts_lists()
        part_images = classified.part_images()

        for part_image in part_images:
            # Check if this part_image is inside at least one parts_list
            inside_any_parts_list = any(
                part_image.bbox.fully_inside(pl.bbox) for pl in parts_lists
            )

            assert inside_any_parts_list, (
                f"Part image {part_image.id} (bbox:{part_image.bbox}) in {fixture_file} "
                f"is not inside any parts_list"
            )

    @pytest.mark.parametrize(
        "fixture_file",
        [f.name for f in (Path(__file__).with_name("fixtures")).glob("*.json")],
    )
    def test_no_labeled_element_is_deleted(self, fixture_file: str) -> None:
        """No element with a label should be marked as deleted.

        If an element has been classified with a label, it should not be deleted.
        This ensures that the classification and deletion logic don't conflict.
        """
        fixture_path = Path(__file__).with_name("fixtures").joinpath(fixture_file)
        page: PageData = PageData.from_json(fixture_path.read_text())  # type: ignore[assignment]

        # Run the full classification pipeline on the page
        result = classify_elements(page)

        # Find all elements that are both labeled and deleted
        labeled_and_deleted = []
        for elem in page.elements:
            if result.get_label(elem) is not None and result.is_removed(elem):
                labeled_and_deleted.append(elem)

        if labeled_and_deleted:
            log.error(
                f"Found {len(labeled_and_deleted)} labeled elements that are deleted:"
            )
            for elem in labeled_and_deleted:
                log.error(
                    f"  - {result.get_label(elem)} id:{elem.id} bbox:{elem.bbox} [DELETED]"
                )

        assert len(labeled_and_deleted) == 0, (
            f"Found {len(labeled_and_deleted)} labeled elements that are deleted in {fixture_file}. "
            f"Labeled elements should not be deleted."
        )
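
For orientation, the snippet below sketches how the helpers in this file fit together outside of pytest. It only mirrors calls the tests above already make; the fixture name is illustrative, and the snippet is not part of the source file.

from pathlib import Path

from build_a_long.pdf_extract.classifier import classify_elements
from build_a_long.pdf_extract.extractor import PageData

# Hypothetical fixture name; real fixtures live under this package's fixtures/ directory.
fixture_path = Path(__file__).with_name("fixtures") / "example_page.json"
page = PageData.from_json(fixture_path.read_text())

# Run the full classification pipeline, then wrap the result for convenient queries.
result = classify_elements(page)
classified = ClassifiedPage(page, result)
classified.print_summary()

# The core rule under test: every parts_list should spatially contain at least one part_image.
for parts_list in classified.parts_lists():
    images = classified.children_of(parts_list, label="part_image")
    print(f"PartsList {parts_list.id}: {len(images)} part image(s)")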