• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

bramp / build-along / 20258147097

16 Dec 2025 05:59AM UTC coverage: 89.094% (+0.4%) from 88.668%
20258147097

push

github

bramp
chore: Minor cleanups

- Add TODO comment about BatchClassificationResult naming
- Remove completed testing improvements from TODO.md

12818 of 14387 relevant lines covered (89.09%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.39
/src/build_a_long/pdf_extract/extractor/testing_utils.py
1
from __future__ import annotations
1✔
2

3
from typing import Any
1✔
4
from unittest.mock import MagicMock
1✔
5

6
import pymupdf
1✔
7

8
from build_a_long.pdf_extract.extractor.pymupdf_types import (
1✔
9
    ImageInfoDict,
10
    RectLikeTuple,
11
    TexttraceChar,
12
    TexttraceSpanDict,
13
)
14

15

16
class PageBuilder:
    """Builder for creating mock PyMuPDF pages for testing.

    Content is registered through the chainable ``add_text``, ``add_image``
    and ``add_drawing`` methods; ``build_mock_page`` then returns a
    ``MagicMock`` whose ``get_text``, ``get_texttrace``, ``get_bboxlog``,
    ``get_image_info``, ``get_images`` and ``get_drawings`` methods return
    fake data derived from what was added.

    Every structure handed to a built page is a snapshot taken at build time,
    so the builder can keep being used afterwards without changing pages that
    were already built.
    """

    def __init__(self) -> None:
        self._text_spans: list[dict[str, Any]] = []
        self._images: list[dict[str, Any]] = []
        self._drawings: list[dict[str, Any]] = []
        # (type, bbox) entries in the order the content was added.
        self._bboxlog: list[tuple[str, tuple[float, float, float, float]]] = []

    def add_text(
        self,
        text: str,
        bbox: RectLikeTuple,
        font: str = "Arial",
        size: float = 12.0,
        seqno: int | None = None,
    ) -> PageBuilder:
        """Add a text span to the page.

        Args:
            text: The characters of the span.
            bbox: Bounding box of the span, indexed as ``(x0, y0, x1, y1)``.
            font: Font name reported for the span.
            size: Font size reported for the span.
            seqno: Sequence number of the span. When ``None``, the number of
                text spans added so far is used.

        Returns:
            This builder, so calls can be chained.
        """
        self._text_spans.append(
            {
                "text": text,
                "bbox": bbox,
                "font": font,
                "size": size,
                "seqno": seqno if seqno is not None else len(self._text_spans),
            }
        )
        self._bboxlog.append(("fill-text", bbox))
        return self

    def add_image(
        self,
        bbox: RectLikeTuple,
        xref: int,
        number: int,
        width: int = 100,
        height: int = 100,
        image_id: str | None = None,
    ) -> PageBuilder:
        """Add an image to the page.

        Args:
            bbox: Bounding box of the image on the page.
            xref: Cross-reference number reported for the image.
            number: Image number reported by ``get_image_info``.
            width: Reported image width.
            height: Reported image height.
            image_id: Name reported by ``get_images``. A falsy value
                (``None`` or ``""``) falls back to ``f"Im{xref}"``.

        Returns:
            This builder, so calls can be chained.
        """
        self._images.append(
            {
                "bbox": bbox,
                "xref": xref,
                "number": number,
                "width": width,
                "height": height,
                "image_id": image_id or f"Im{xref}",
            }
        )
        self._bboxlog.append(("fill-image", bbox))
        return self

    def add_drawing(
        self,
        bbox: RectLikeTuple,
        level: int = 0,
        seqno: int | None = None,
    ) -> PageBuilder:
        """Add a drawing to the page.

        Args:
            bbox: Bounding box of the drawing.
            level: Value reported under the drawing's ``"level"`` key.
            seqno: Value reported under the drawing's ``"seqno"`` key. Unlike
                ``add_text``, ``None`` is stored as-is (no default is
                derived).

        Returns:
            This builder, so calls can be chained.
        """
        self._drawings.append(
            {
                "bbox": bbox,
                "level": level,
                "seqno": seqno,
            }
        )
        # Real drawings do not necessarily show up in the bboxlog as exactly
        # "fill-path"; the entry is recorded so bbox-matching logic has
        # something to match against.
        self._bboxlog.append(("fill-path", bbox))
        return self

    def _make_texttrace(self) -> list[TexttraceSpanDict]:
        """Generate the ``get_texttrace`` structure from the added text spans."""
        spans: list[TexttraceSpanDict] = []
        for span in self._text_spans:
            bbox = span["bbox"]
            # One fake char tuple per character: (code point, index used as a
            # stand-in glyph id, origin at the span's bottom-left corner, and
            # the whole span bbox reused as the char bbox).
            chars: list[TexttraceChar] = [
                (ord(c), i, (bbox[0], bbox[3]), bbox)
                for i, c in enumerate(span["text"])
            ]
            # Deliberately no "text" key: the span carries its text only via
            # "chars" (presumably the consumer rebuilds the string from them -
            # confirm against Text.from_texttrace_span).
            spans.append(
                {
                    "bbox": bbox,
                    "font": span["font"],
                    "size": span["size"],
                    "seqno": span["seqno"],
                    "chars": chars,
                }
            )
        return spans

    def _make_rawdict(self) -> dict[str, Any]:
        """Generate a rawdict-shaped structure from the added text spans.

        Every added span becomes its own line; all lines live in a single
        text block (which is emitted even when no text was added).
        """
        lines = []
        for span in self._text_spans:
            bbox = span["bbox"]
            # Fake per-character boxes: each char is 4 units wide, laid out
            # left to right from the span's x0 and spanning its full height.
            chars = [
                {
                    "c": c,
                    "bbox": (
                        bbox[0] + i * 4,
                        bbox[1],
                        bbox[0] + (i + 1) * 4,
                        bbox[3],
                    ),
                }
                for i, c in enumerate(span["text"])
            ]
            lines.append(
                {
                    "spans": [
                        {
                            "bbox": bbox,
                            "font": span["font"],
                            "size": span["size"],
                            "chars": chars,
                            "origin": (bbox[0], bbox[3]),
                        }
                    ],
                    "bbox": bbox,
                }
            )

        return {
            "blocks": [
                {
                    "type": 0,  # text block
                    "lines": lines,
                }
            ]
        }

    def _make_image_info(self) -> list[ImageInfoDict]:
        """Generate the ``get_image_info`` structure for the added images."""
        return [
            {
                "number": img["number"],
                "bbox": img["bbox"],
                "width": img["width"],
                "height": img["height"],
                "colorspace": 3,
                "xres": 96,
                "yres": 96,
                "bpc": 8,
                "size": 1000,
                # Unit scale, translated to the bbox's top-left corner.
                "transform": (1.0, 0.0, 0.0, 1.0, img["bbox"][0], img["bbox"][1]),
                "xref": img["xref"],
                "digest": b"fake_digest",  # ImageInfoDict might expect it
            }
            for img in self._images
        ]

    def _make_images(self) -> list[tuple]:
        """Generate the ``get_images`` structure for the added images."""
        return [
            (
                img["xref"],
                0,  # smask
                img["width"],
                img["height"],
                8,  # bpc
                3,  # colorspace
                "",
                img["image_id"],
                "DCTDecode",
                0,
            )
            for img in self._images
        ]

    def _make_drawings(self) -> list[dict[str, Any]]:
        """Generate the ``get_drawings`` structure for the added drawings."""
        return [
            {
                "rect": pymupdf.Rect(d["bbox"]),
                "level": d["level"],
                "seqno": d["seqno"],
                "items": [("re", pymupdf.Rect(d["bbox"]))],  # Dummy item
            }
            for d in self._drawings
        ]

    def _make_bboxlog(self) -> list[tuple[str, tuple[float, float, float, float]]]:
        """Return a snapshot of the bboxlog entries recorded so far.

        A copy is returned so that content added to the builder after a page
        was built does not leak into that page's ``get_bboxlog`` result.
        """
        return list(self._bboxlog)

    @staticmethod
    def _make_get_text(rawdict: dict[str, Any]) -> Any:
        """Return the callable used as the mock page's ``get_text`` side effect.

        The returned function yields ``rawdict`` for both the ``"dict"`` and
        the ``"rawdict"`` option (the fake structure carries per-character
        ``chars`` lists, i.e. it is rawdict-shaped) and an empty string for
        every other option. Extra keyword arguments are accepted and ignored
        so calls that pass further ``get_text`` options do not raise
        ``TypeError``.
        """

        def get_text(
            option: str = "text", flags: int | None = None, **kwargs: Any
        ) -> Any:
            if option in ("dict", "rawdict"):
                return rawdict
            return ""

        return get_text

    def build_mock_page(self) -> MagicMock:
        """Build and return a configured MagicMock page.

        The page reports a 612 x 792 ``rect`` and an identity
        ``transformation_matrix``. All return values are computed once, from
        the content added so far.
        """
        mock_page = MagicMock()
        mock_page.rect = MagicMock(x0=0, y0=0, x1=612, y1=792)
        mock_page.transformation_matrix = pymupdf.Identity

        # Configure text methods
        mock_page.get_text.side_effect = self._make_get_text(self._make_rawdict())
        mock_page.get_texttrace.return_value = self._make_texttrace()

        # Configure image methods
        mock_page.get_bboxlog.return_value = self._make_bboxlog()
        mock_page.get_image_info.return_value = self._make_image_info()
        mock_page.get_images.return_value = self._make_images()

        # Configure drawings
        mock_page.get_drawings.return_value = self._make_drawings()

        return mock_page
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc