• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 24798516591

22 Apr 2026 07:19PM UTC coverage: 90.606% (-0.2%) from 90.758%
24798516591

push

github

speedyk-005
refactor: remove redundant type hints from docstrings

- Strip (type) from Args/Returns where signature already has types
- Simplify Returns format to prose description
- Run clean_docstrings.py on src/chunklet (26 files)
- Add ExtractionState TypedDict for type safety (from earlier refactor)

1360 of 1501 relevant lines covered (90.61%)

3.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.61
/src/chunklet/document_chunker/processors/pdf_processor.py
1
from typing import Any, Generator
4✔
2

3
import re
4✔
4
from more_itertools import ilen
4✔
5

6
# pdfminer is lazily imported
7

8
from chunklet.document_chunker.processors.base_processor import BaseProcessor
4✔
9

10

11
# Two or more consecutive newlines, optionally separated by whitespace.
MULTIPLE_NEWLINE_PATTERN = re.compile(r"(\n\s*){2,}")

# A line that holds nothing but digits (typically a bare page number).
# The trailing newline is asserted with a lookahead instead of being consumed:
# substituting "" then removes the number line but keeps the line break
# between its neighbours (so their words are not glued together), and
# back-to-back number lines are each matched.
STANDALONE_NUMBER_PATTERN = re.compile(r"\n\s*\d+\s*(?=\n)")

# A line made of a single word character followed by one marker punctuation
# (e.g. "1.", "a)", "A-"). Group 1 captures the leading newline plus the
# marker so a substitution such as r"\1 " can join the marker with the text
# on the following line.
HEADING_OR_LIST_PATTERN = re.compile(
    r"(\n"  # A newline
    r"\w"  # A single Unicode word character (letter, digit or underscore)
    r"[.\-)*]"  # Followed by a punctuation
    r")\n",  # The newline character to be replaced
)

# Common page-number decorations found in headers and footers.
PAGE_PATTERN = re.compile(
    r"Page \d+ of \d+.*|"  # "Page X of Y" plus the remainder of that line
    r"-\s*\d+\s*-|"  # Page numbers with dashes
    r"\s*\|\s*Page\s+\d+\s*\|\s*",  # Boxed page numbers
    re.M,
)
29

30

31
class PDFProcessor(BaseProcessor):
    """
    PDF extraction and cleanup utility using `pdfminer.six`.

    Provides methods to extract text and metadata from PDF files,
    while cleaning and normalizing the extracted text using regex patterns.

    This processor extracts **metadata** from the PDF document's **information
    dictionary**, focusing on core metadata rather than all available fields.

    For more details on PDF metadata extraction using `pdfminer.six`, refer to
    this relevant Stack Overflow discussion:

    https://stackoverflow.com/questions/75591385/extract-metadata-info-from-online-pdf-using-pdfminer-in-python
    """

    # Normalized (lower-case) metadata keys that are kept from the info dictionary.
    METADATA_FIELDS = [
        "title",
        "author",
        "creator",
        "producer",
        "publisher",
        "created",
        "modified",
    ]

    # PDF info-dictionary keys that are renamed before normalization.
    PDF_METADATA_KEY_MAP = {
        "CreationDate": "created",
        "ModDate": "modified",
    }

    def __init__(self, file_path: str):
        """Initialize the PDFProcessor.

        Args:
            file_path: Path to the PDF file.

        Raises:
            ImportError: If `pdfminer.six` is not installed.
        """
        try:
            from pdfminer.layout import LAParams
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "The 'pdfminer.six' library is not installed. "
                "Please install it with 'pip install 'pdfminer.six>=20250324'' or install the document processing extras "
                "with 'pip install 'chunklet-py[structured-document]''"
            ) from e
        self.file_path = file_path
        self.laparams = LAParams(
            line_margin=0.5,
        )

    def _cleanup_text(self, text: str) -> str:
        """Clean and normalize extracted PDF text.

        Performs, in order:
            - Collapse runs of blank lines into a single newline
            - Join a short list/heading marker line (e.g. "1.") with the
              line that follows it
            - Remove lines containing only numbers (page numbers)
            - Remove page-number decorations ("Page X of Y", "- 3 -",
              "| Page 3 |")

        Args:
            text: Raw text extracted from PDF page.

        Returns:
            Cleaned and normalized text, or an empty string for empty input.
        """
        if not text:
            return ""
        text = MULTIPLE_NEWLINE_PATTERN.sub("\n", text)
        text = HEADING_OR_LIST_PATTERN.sub(r"\1 ", text)
        text = STANDALONE_NUMBER_PATTERN.sub("", text)
        text = PAGE_PATTERN.sub("", text)
        return text

    def _safe_decode(self, value: str | bytes) -> str:
        """Utility to decode bytes to str, ignoring errors, otherwise return as-is.

        PDF text strings may be stored as UTF-16BE prefixed with a byte-order
        mark (``FE FF``); those are decoded as UTF-16BE so they do not come
        out interleaved with NUL characters. All other byte strings are
        decoded as UTF-8. Undecodable bytes are dropped in both cases.

        Args:
            value: The input value, which may be a string or a byte sequence.

        Returns:
            The decoded string if the input was bytes; any non-bytes value
            is returned unchanged.
        """
        if not isinstance(value, bytes):
            return value
        if value.startswith(b"\xfe\xff"):
            # UTF-16BE text string: skip the 2-byte BOM before decoding.
            return value[2:].decode("utf-16-be", "ignore")
        return value.decode("utf-8", "ignore")

    def _extract_info_metadata(self, doc: Any) -> dict:
        """Extract metadata from PDF document info dictionary.

        Reads PDF info fields and extracts standardized metadata fields
        defined in METADATA_FIELDS. Keys are renamed through
        PDF_METADATA_KEY_MAP and lower-cased before being matched.

        Args:
            doc: PDFDocument instance with info attribute.

        Returns:
            Dictionary of normalized metadata key-value pairs; empty when the
            document has no (or an empty) `info` attribute.
        """
        metadata = {}
        info_dicts = getattr(doc, "info", None)
        if not info_dicts:
            return metadata

        for info in info_dicts:
            for raw_key, raw_value in info.items():
                key = self._safe_decode(raw_key)
                key = self.PDF_METADATA_KEY_MAP.get(key, key).lower()
                if key in self.METADATA_FIELDS:
                    # Later info dictionaries overwrite earlier ones.
                    metadata[key] = self._safe_decode(raw_value)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Yield cleaned text from each PDF page.

        Counts the pages once, then calls pdfminer.high_level.extract_text
        with a single page number per iteration so that only one page of text
        is held in memory at a time. Each page's text is passed through
        _cleanup_text before being yielded.

        Yields:
            Cleaned text content from each PDF page.
        """
        from pdfminer.high_level import extract_text
        from pdfminer.pdfpage import PDFPage

        # The handle is only needed for counting pages; close it before
        # yielding so it is not kept open for the generator's whole lifetime.
        with open(self.file_path, "rb") as fp:
            page_count = ilen(PDFPage.get_pages(fp))

        for page_num in range(page_count):
            # extract_text is given the path (not the handle above), so each
            # call works on the file independently of the page count step.
            raw_text = extract_text(
                self.file_path,
                page_numbers=[page_num],
                laparams=self.laparams,
            )
            yield self._cleanup_text(raw_text)

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts metadata from the PDF document's information dictionary.

        Includes source path, page count, and PDF info fields.

        Returns:
            A dictionary that always contains:
                - source
                - page_count
            and, when present in the PDF info dictionary:
                - title
                - author
                - creator
                - producer
                - publisher
                - created
                - modified
        """
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser

        metadata = {"source": str(self.file_path), "page_count": 0}
        with open(self.file_path, "rb") as f:
            # Initialize parser on the file stream
            parser = PDFParser(f)

            # PDFDocument reads file structure, consuming the file pointer
            doc = PDFDocument(parser)

            # Reset pointer to start of file stream for accurate page counting
            f.seek(0)

            metadata["page_count"] = ilen(PDFPage.get_pages(f))
            metadata.update(self._extract_info_metadata(doc))

        return metadata
4✔
207

208

209
# --- Example usage ---
210
if __name__ == "__main__":  # pragma: no cover
    # Small manual demo: dump the metadata, then a preview of every page.
    sample_path = "samples/sample-pdf-a4-size.pdf"
    demo = PDFProcessor(sample_path)

    print("Metadata:")
    for field, value in demo.extract_metadata().items():
        print(f"{field}: {value}")

    print("\nText content preview:\n")
    page_no = 0
    for content in demo.extract_text():
        page_no += 1
        print(f"--- page {page_no} ---")
        # Only the first 512 characters of each page are shown.
        preview = content[:512]
        print(preview, "...")
        print("\n --- \n")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc