20258147097

Committed 16 Dec 2025 05:59AM UTC coverage: 89.094% (+0.4%) from 88.668%

Build # 20258147097

Build Type

push

github

Committed by

bramp

Commit Message

chore: Minor cleanups

- Add TODO comment about BatchClassificationResult naming
- Remove completed testing improvements from TODO.md

Run Details

12818 of 14387 relevant lines covered (89.09%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.39

/src/build_a_long/pdf_extract/extractor/testing_utils.py

from __future__ import annotations

from typing import Any
from unittest.mock import MagicMock

import pymupdf

from build_a_long.pdf_extract.extractor.pymupdf_types import (
    ImageInfoDict,
    RectLikeTuple,
    TexttraceChar,
    TexttraceSpanDict,
)


class PageBuilder:
    """Fluent builder that assembles a ``MagicMock`` standing in for a PyMuPDF page.

    Text spans, images and drawings are accumulated with the ``add_*`` methods
    (each returns ``self`` so calls can be chained).  ``build_mock_page`` then
    produces a mock whose ``get_text``, ``get_texttrace``, ``get_bboxlog``,
    ``get_image_info``, ``get_images`` and ``get_drawings`` methods return data
    derived from what was added.
    """

    def __init__(self) -> None:
        # Raw records exactly as supplied to the add_* methods.
        self._text_spans: list[dict[str, Any]] = []
        self._images: list[dict[str, Any]] = []
        self._drawings: list[dict[str, Any]] = []
        # (kind, bbox) entries in insertion order, across all element types.
        self._bboxlog: list[tuple[str, tuple[float, float, float, float]]] = []

    def add_text(
        self,
        text: str,
        bbox: RectLikeTuple,
        font: str = "Arial",
        size: float = 12.0,
        seqno: int | None = None,
    ) -> PageBuilder:
        """Add a text span to the page.

        Args:
            text: The characters of the span.
            bbox: Bounding box stored on the span and logged as ``"fill-text"``.
            font: Font name recorded on the span.
            size: Font size recorded on the span.
            seqno: Sequence number; when ``None`` the number of text spans
                added so far is used.

        Returns:
            This builder, to allow chaining.
        """
        self._text_spans.append(
            {
                "text": text,
                "bbox": bbox,
                "font": font,
                "size": size,
                "seqno": seqno if seqno is not None else len(self._text_spans),
            }
        )
        self._bboxlog.append(("fill-text", bbox))
        return self

    def add_image(
        self,
        bbox: RectLikeTuple,
        xref: int,
        number: int,
        width: int = 100,
        height: int = 100,
        image_id: str | None = None,
    ) -> PageBuilder:
        """Add an image to the page.

        Args:
            bbox: Bounding box stored on the image and logged as ``"fill-image"``.
            xref: Cross-reference number reported for the image.
            number: Value reported under ``"number"`` in the image info.
            width: Image width reported to callers.
            height: Image height reported to callers.
            image_id: Name reported by ``get_images``; when omitted (or
                falsy) it defaults to ``f"Im{xref}"``.

        Returns:
            This builder, to allow chaining.
        """
        self._images.append(
            {
                "bbox": bbox,
                "xref": xref,
                "number": number,
                "width": width,
                "height": height,
                "image_id": image_id or f"Im{xref}",
            }
        )
        self._bboxlog.append(("fill-image", bbox))
        return self

    def add_drawing(
        self,
        bbox: RectLikeTuple,
        level: int = 0,
        seqno: int | None = None,
    ) -> PageBuilder:
        """Add a drawing to the page.

        Args:
            bbox: Bounding box of the drawing.
            level: Value reported under ``"level"`` for the drawing.
            seqno: Value reported under ``"seqno"``; stored as given, so it
                stays ``None`` when omitted (unlike ``add_text``).

        Returns:
            This builder, to allow chaining.
        """
        self._drawings.append(
            {
                "bbox": bbox,
                "level": level,
                "seqno": seqno,
            }
        )
        # Drawings often don't appear in bboxlog exactly as "fill-path",
        # but for matching logic we might need them. Adding as fill-path for now.
        self._bboxlog.append(("fill-path", bbox))
        return self

    def _make_texttrace(self) -> list[TexttraceSpanDict]:
        """Generate the texttrace structure from the added text spans.

        Every character of a span becomes a tuple
        ``(ord(char), index, (bbox[0], bbox[3]), bbox)``; all characters of a
        span share the span's own bbox and the same origin point.
        """
        spans: list[TexttraceSpanDict] = []
        for span in self._text_spans:
            bbox = span["bbox"]
            text = span["text"]
            # Generate fake chars
            chars: list[TexttraceChar] = [
                (ord(c), i, (bbox[0], bbox[3]), bbox) for i, c in enumerate(text)
            ]
            spans.append(
                {
                    "bbox": bbox,
                    "font": span["font"],
                    "size": span["size"],
                    "seqno": span["seqno"],
                    # Deliberately no "text" key: the text is carried only by
                    # "chars". NOTE(review): presumably consumers rebuild the
                    # string from the chars - confirm against TexttraceSpanDict.
                    "chars": chars,
                }
            )
        return spans

    def _make_rawdict(self) -> dict[str, Any]:
        """Generate the rawdict structure from the added text spans.

        Each added span becomes its own line holding a single span.  Characters
        are given consecutive 4-unit-wide boxes starting at ``bbox[0]``,
        regardless of the span's actual width.  The result always contains
        exactly one text block (``"type": 0``), even when no text was added.
        """
        lines = []
        for span in self._text_spans:
            bbox = span["bbox"]
            text = span["text"]
            chars = [
                {
                    "c": c,
                    "bbox": (
                        bbox[0] + i * 4,
                        bbox[1],
                        bbox[0] + (i + 1) * 4,
                        bbox[3],
                    ),
                }
                for i, c in enumerate(text)
            ]
            lines.append(
                {
                    "spans": [
                        {
                            "bbox": bbox,
                            "font": span["font"],
                            "size": span["size"],
                            "chars": chars,
                            "origin": (bbox[0], bbox[3]),
                        }
                    ],
                    "bbox": bbox,
                }
            )

        return {
            "blocks": [
                {
                    "type": 0,  # text block
                    "lines": lines,
                }
            ]
        }

    def _make_image_info(self) -> list[ImageInfoDict]:
        """Generate the ``get_image_info`` structure, one dict per added image.

        Only ``number``, ``bbox``, ``width``, ``height`` and ``xref`` come from
        the added image; the remaining fields are fixed placeholder values.
        The transform is a pure translation to ``(bbox[0], bbox[1])``.
        """
        return [
            {
                "number": img["number"],
                "bbox": img["bbox"],
                "width": img["width"],
                "height": img["height"],
                "colorspace": 3,
                "xres": 96,
                "yres": 96,
                "bpc": 8,
                "size": 1000,
                "transform": (1.0, 0.0, 0.0, 1.0, img["bbox"][0], img["bbox"][1]),
                "xref": img["xref"],
                "digest": b"fake_digest",  # Added because ImageInfoDict might expect it
            }
            for img in self._images
        ]

    def _make_images(self) -> list[tuple]:
        """Generate the ``get_images`` structure, one 10-tuple per added image."""
        return [
            (
                img["xref"],
                0,  # smask
                img["width"],
                img["height"],
                8,  # bpc
                3,  # colorspace
                "",
                img["image_id"],
                "DCTDecode",
                0,
            )
            for img in self._images
        ]

    def _make_drawings(self) -> list[dict[str, Any]]:
        """Generate the ``get_drawings`` structure, one dict per added drawing.

        The bbox is wrapped in ``pymupdf.Rect`` both as the drawing's ``rect``
        and as the single dummy ``"re"`` item.
        """
        return [
            {
                "rect": pymupdf.Rect(d["bbox"]),
                "level": d["level"],
                "seqno": d["seqno"],
                "items": [("re", pymupdf.Rect(d["bbox"]))],  # Dummy item
            }
            for d in self._drawings
        ]

    def build_mock_page(self) -> MagicMock:
        """Build and return a configured MagicMock page.

        The mock reports a 612 x 792 ``rect`` and an identity
        ``transformation_matrix``.  All payloads are snapshots taken at build
        time, so elements added to the builder afterwards do not leak into a
        page that was already built.

        Returns:
            The configured mock page.
        """
        mock_page = MagicMock()
        mock_page.rect = MagicMock(x0=0, y0=0, x1=612, y1=792)
        mock_page.transformation_matrix = pymupdf.Identity

        # Configure get_text
        rawdict = self._make_rawdict()
        texttrace = self._make_texttrace()

        def get_text_side_effect(option: str = "text", flags: Any = None) -> Any:
            # Only the "dict" option yields structured data; any other option
            # (including the default "text") returns an empty string.
            if option == "dict":  # Used by Extractor when use_rawdict=True
                return rawdict
            return ""

        mock_page.get_text.side_effect = get_text_side_effect
        mock_page.get_texttrace.return_value = texttrace

        # Configure image methods
        # Hand out a copy: returning the live list would let later add_* calls
        # alter the bboxlog of this page, unlike every other payload here.
        mock_page.get_bboxlog.return_value = list(self._bboxlog)
        mock_page.get_image_info.return_value = self._make_image_info()
        mock_page.get_images.return_value = self._make_images()

        # Configure drawings
        mock_page.get_drawings.return_value = self._make_drawings()

        return mock_page

1	from __future__ import annotations	1✔
2
3	from typing import Any	1✔
4	from unittest.mock import MagicMock	1✔
5
6	import pymupdf	1✔
7
8	from build_a_long.pdf_extract.extractor.pymupdf_types import (	1✔
9	ImageInfoDict,
10	RectLikeTuple,
11	TexttraceChar,
12	TexttraceSpanDict,
13	)
14
15
16	class PageBuilder:	1✔
17	"""Builder for creating mock PyMuPDF pages for testing."""
18
19	def __init__(self) -> None:	1✔
20	self._text_spans: list[dict[str, Any]] = []	1✔
21	self._images: list[dict[str, Any]] = []	1✔
22	self._drawings: list[dict[str, Any]] = []	1✔
23	self._bboxlog: list[tuple[str, tuple[float, float, float, float]]] = []	1✔
24
25	def add_text(	1✔
26	self,
27	text: str,
28	bbox: RectLikeTuple,
29	font: str = "Arial",
30	size: float = 12.0,
31	seqno: int | None = None,
32	) -> PageBuilder:
33	"""Add a text span to the page."""
34	self._text_spans.append(	1✔
35	{
36	"text": text,
37	"bbox": bbox,
38	"font": font,
39	"size": size,
40	"seqno": seqno if seqno is not None else len(self._text_spans),
41	}
42	)
43	self._bboxlog.append(("fill-text", bbox))	1✔
44	return self	1✔
45
46	def add_image(	1✔
47	self,
48	bbox: RectLikeTuple,
49	xref: int,
50	number: int,
51	width: int = 100,
52	height: int = 100,
53	image_id: str | None = None,
54	) -> PageBuilder:
55	"""Add an image to the page."""
56	self._images.append(	1✔
57	{
58	"bbox": bbox,
59	"xref": xref,
60	"number": number,
61	"width": width,
62	"height": height,
63	"image_id": image_id or f"Im{xref}",
64	}
65	)
66	self._bboxlog.append(("fill-image", bbox))	1✔
67	return self	1✔
68
69	def add_drawing(	1✔
70	self,
71	bbox: RectLikeTuple,
72	level: int = 0,
73	seqno: int | None = None,
74	) -> PageBuilder:
75	"""Add a drawing to the page."""
76	self._drawings.append(	1✔
77	{
78	"bbox": bbox,
79	"level": level,
80	"seqno": seqno,
81	}
82	)
83	# Drawings often don't appear in bboxlog exactly as "fill-path",
84	# but for matching logic we might need them. Adding as fill-path for now.
85	self._bboxlog.append(("fill-path", bbox))	1✔
86	return self	1✔
87
88	def _make_texttrace(self) -> list[TexttraceSpanDict]:	1✔
89	"""Generate texttrace structure from added text spans."""
90	spans: list[TexttraceSpanDict] = []	1✔
91	for span in self._text_spans:	1✔
92	bbox = span["bbox"]	1✔
93	text = span["text"]	1✔
94	# Generate fake chars
95	chars: list[TexttraceChar] = [	1✔
96	(ord(c), i, (bbox[0], bbox[3]), bbox) for i, c in enumerate(text)
97	]
98	spans.append(	1✔
99	{
100	"bbox": bbox,
101	"font": span["font"],
102	"size": span["size"],
103	"seqno": span["seqno"],
104	"chars": chars,
105	# "text": text, # texttrace doesn't have "text" field directly on span?
106	# Actually, TexttraceSpanDict definition in pymupdf_types.py implies it does NOT have 'text'.
107	# But our make_texttrace_span helper in tests put it there? No, it put 'chars'.
108	# Wait, Extractor._extract_text_blocks_from_texttrace uses span.get("bbox") and Text.from_texttrace_span.
109	# Text.from_texttrace_span likely reconstructs text from chars.
110	}
111	)
112	return spans	1✔
113
114	def _make_rawdict(self) -> dict[str, Any]:	1✔
115	"""Generate rawdict structure from added text spans."""
116	lines = []	1✔
117	for span in self._text_spans:	1✔
118	bbox = span["bbox"]	1✔
119	text = span["text"]	1✔
120	chars = [	1✔
121	{
122	"c": c,
123	"bbox": (
124	bbox[0] + i * 4,
125	bbox[1],
126	bbox[0] + (i + 1) * 4,
127	bbox[3],
128	),
129	}
130	for i, c in enumerate(text)
131	]
132	lines.append(	1✔
133	{
134	"spans": [
135	{
136	"bbox": bbox,
137	"font": span["font"],
138	"size": span["size"],
139	"chars": chars,
140	"origin": (bbox[0], bbox[3]),
141	}
142	],
143	"bbox": bbox,
144	}
145	)
146
147	return {	1✔
148	"blocks": [
149	{
150	"type": 0, # text block
151	"lines": lines,
152	}
153	]
154	}
155
156	def _make_image_info(self) -> list[ImageInfoDict]:	1✔
157	"""Generate get_image_info structure."""
158	return [	1✔
159	{
160	"number": img["number"],
161	"bbox": img["bbox"],
162	"width": img["width"],
163	"height": img["height"],
164	"colorspace": 3,
165	"xres": 96,
166	"yres": 96,
167	"bpc": 8,
168	"size": 1000,
169	"transform": (1.0, 0.0, 0.0, 1.0, img["bbox"][0], img["bbox"][1]),
170	"xref": img["xref"],
171	"digest": b"fake_digest", # Added because ImageInfoDict might expect it
172	}
173	for img in self._images
174	]
175
176	def _make_images(self) -> list[tuple]:	1✔
177	"""Generate get_images structure."""
178	return [	1✔
179	(
180	img["xref"],
181	0, # smask
182	img["width"],
183	img["height"],
184	8, # bpc
185	3, # colorspace
186	"",
187	img["image_id"],
188	"DCTDecode",
189	0,
190	)
191	for img in self._images
192	]
193
194	def _make_drawings(self) -> list[dict[str, Any]]:	1✔
195	"""Generate get_drawings structure."""
196	return [	1✔
197	{
198	"rect": pymupdf.Rect(d["bbox"]),
199	"level": d["level"],
200	"seqno": d["seqno"],
201	"items": [("re", pymupdf.Rect(d["bbox"]))], # Dummy item
202	}
203	for d in self._drawings
204	]
205
206	def build_mock_page(self) -> MagicMock:	1✔
207	"""Build and return a configured MagicMock page."""
208	mock_page = MagicMock()	1✔
209	mock_page.rect = MagicMock(x0=0, y0=0, x1=612, y1=792)	1✔
210	mock_page.transformation_matrix = pymupdf.Identity	1✔
211
212	# Configure get_text
213	rawdict = self._make_rawdict()	1✔
214	texttrace = self._make_texttrace()	1✔
215
216	def get_text_side_effect(option="text", flags=None):	1✔
217	if option == "dict": # Used by Extractor when use_rawdict=True	1✔
218	return rawdict	1✔
219	return ""	×
220
221	mock_page.get_text.side_effect = get_text_side_effect	1✔
222	mock_page.get_texttrace.return_value = texttrace	1✔
223
224	# Configure image methods
225	mock_page.get_bboxlog.return_value = self._bboxlog	1✔
226	mock_page.get_image_info.return_value = self._make_image_info()	1✔
227	mock_page.get_images.return_value = self._make_images()	1✔
228
229	# Configure drawings
230	mock_page.get_drawings.return_value = self._make_drawings()	1✔
231
232	return mock_page	1✔

bramp / build-along / 20258147097

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous