• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 23031162841

13 Mar 2026 12:54AM UTC coverage: 90.659% (-0.01%) from 90.671%
23031162841

Pull #14

github

web-flow
Merge a3ae6968f into 4c6b47c93
Pull Request #14: Refactor method ordering to follow Step-down Rule

352 of 372 new or added lines in 8 files covered. (94.62%)

1 existing line in 1 file now uncovered.

1349 of 1488 relevant lines covered (90.66%)

4.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.61
/src/chunklet/document_chunker/processors/pdf_processor.py
1
from typing import Any, Generator
5✔
2

3
import regex as re
5✔
4
from more_itertools import ilen
5✔
5

6
# pdfminer is lazily imported
7
from chunklet.document_chunker.processors.base_processor import BaseProcessor
5✔
8

9
# Collapses runs of two or more (possibly whitespace-padded) newlines into one.
MULTIPLE_NEWLINE_PATTERN = re.compile(r"(\n\s*){2,}")

# Matches a line consisting solely of Unicode digits (\p{N}) — typically a
# bare page number — including the surrounding newlines.
STANDALONE_NUMBER_PATTERN = re.compile(r"\n\s*\p{N}+\s*\n")

# Matches a one-character heading/list marker on its own line (e.g. "1.",
# "a)", "b-", "*"). The trailing newline is replaced so the marker is
# re-joined with the text that follows it.
HEADING_OR_LIST_PATTERN = re.compile(
    r"(\n"  # A newline
    r"[\p{L}\p{N}]"  # A single Unicode letter or Unicode number
    r"[.\-)*]"  # Followed by a marker punctuation: . - ) *
    r")\n",  # The newline character to be replaced
    re.U,
)

# Matches common page-number artifacts left behind by PDF text extraction.
PAGE_PATTERN = re.compile(
    r"Page \p{N}+ of \p{N}+.*|"  # "Page X of Y" plus trailing text
    r"-\s*\p{N}+\s*-|"  # Page numbers with dashes, e.g. "- 3 -"
    r"\s*\|\s*Page\s+\p{N}+\s*\|\s*",  # Boxed page numbers, e.g. "| Page 3 |"
    re.M,
)
30

31

32
class PDFProcessor(BaseProcessor):
    """
    PDF extraction and cleanup utility using `pdfminer.six`.

    Provides methods to extract text and metadata from PDF files,
    while cleaning and normalizing the extracted text using regex patterns.

    This processor extracts **metadata** from the PDF document's **information
    dictionary**, focusing on core metadata rather than all available fields.

    For more details on PDF metadata extraction using `pdfminer.six`, refer to
    this relevant Stack Overflow discussion:

    https://stackoverflow.com/questions/75591385/extract-metadata-info-from-online-pdf-using-pdfminer-in-python
    """

    # Normalized metadata keys exposed by extract_metadata().
    METADATA_FIELDS = [
        "title",
        "author",
        "creator",
        "producer",
        "publisher",
        "created",
        "modified",
    ]

    # Maps raw PDF info-dictionary keys to the normalized names above.
    PDF_METADATA_KEY_MAP = {
        "CreationDate": "created",
        "ModDate": "modified",
    }

    def __init__(self, file_path: str):
        """Initialize the PDFProcessor.

        Args:
            file_path (str): Path to the PDF file.

        Raises:
            ImportError: If the optional `pdfminer.six` dependency is not
                installed.
        """
        # pdfminer is imported lazily so this module can be imported even
        # when the optional dependency is absent.
        try:
            from pdfminer.layout import LAParams
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "The 'pdfminer.six' library is not installed. "
                "Please install it with 'pip install 'pdfminer.six>=20250324'' or install the document processing extras "
                "with 'pip install 'chunklet-py[structured-document]''"
            ) from e
        self.file_path = file_path
        # Layout-analysis parameters controlling how pdfminer groups lines.
        self.laparams = LAParams(
            line_margin=0.5,
        )

    def extract_text(self) -> Generator[str, None, None]:
        """Yield cleaned text from each PDF page.

        Extracts text page by page via ``pdfminer.high_level.extract_text``
        so that only one page's text is held in memory at a time, keeping
        peak memory bounded for large files. Each page's raw text is
        normalized with :meth:`_cleanup_text` before being yielded.

        Yields:
            str: Cleaned text content from each PDF page.
        """
        from pdfminer.high_level import extract_text
        from pdfminer.pdfpage import PDFPage

        with open(self.file_path, "rb") as fp:
            page_count = ilen(PDFPage.get_pages(fp))

            for page_num in range(page_count):
                # NOTE(review): extract_text() re-opens and re-parses the
                # document on every call, so this loop trades parse time for
                # bounded memory — it is *not* cheaper than a single
                # whole-document extraction in CPU terms.
                raw_text = extract_text(
                    self.file_path,
                    page_numbers=[page_num],
                    laparams=self.laparams,
                )
                yield self._cleanup_text(raw_text)

    def _cleanup_text(self, text: str) -> str:
        """Clean and normalize extracted PDF text.

        Performs, in order:
            - Collapse runs of blank/multiple newlines into a single newline
            - Re-join single-character heading/list markers (e.g. "1.", "a)")
              with the text on the following line
            - Remove lines containing only numbers (page numbers)
            - Strip common page-number artifacts ("Page X of Y", "- 3 -",
              "| Page 3 |")

        Args:
            text (str): Raw text extracted from a PDF page.

        Returns:
            str: Cleaned and normalized text; empty string for empty input.
        """
        if not text:
            return ""
        # Order matters: newline collapsing must run before the patterns
        # that anchor on single newlines.
        text = MULTIPLE_NEWLINE_PATTERN.sub("\n", text)
        text = HEADING_OR_LIST_PATTERN.sub(r"\1 ", text)
        text = STANDALONE_NUMBER_PATTERN.sub("", text)
        text = PAGE_PATTERN.sub("", text)
        return text

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts metadata from the PDF document's information dictionary.

        Includes source path, page count, and PDF info fields.

        Returns:
            dict[str, Any]: A dictionary containing ``source``,
                ``page_count``, and any of the normalized info fields:
                title, author, creator, producer, publisher, created,
                modified.
        """
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser

        metadata = {"source": str(self.file_path), "page_count": 0}
        with open(self.file_path, "rb") as f:
            # Initialize parser on the file stream
            parser = PDFParser(f)

            # PDFDocument reads file structure, consuming the file pointer
            doc = PDFDocument(parser)

            # Reset pointer to start of file stream for accurate page counting
            f.seek(0)

            metadata["page_count"] = ilen(PDFPage.get_pages(f))
            metadata.update(self._extract_info_metadata(doc))

        return metadata

    def _extract_info_metadata(self, doc: Any) -> dict:
        """Extract metadata from PDF document info dictionary.

        Reads PDF info fields and extracts standardized metadata fields
        defined in METADATA_FIELDS.

        Args:
            doc (Any): PDFDocument instance with an ``info`` attribute.

        Returns:
            dict: Dictionary of normalized metadata key-value pairs.
        """
        metadata = {}
        if not (hasattr(doc, "info") and doc.info):
            return metadata

        # doc.info is a list of info dictionaries (one per document revision).
        for info in doc.info:
            for raw_key, raw_value in info.items():
                # Decode once, then map pdfminer-native key names
                # (e.g. "CreationDate") onto our normalized field names.
                decoded_key = self._safe_decode(raw_key)
                key = self.PDF_METADATA_KEY_MAP.get(decoded_key, decoded_key)
                value = self._safe_decode(raw_value)
                key_lower = key.lower()
                if key_lower in self.METADATA_FIELDS:
                    metadata[key_lower] = value
        return metadata

    def _safe_decode(self, value: str | bytes) -> str:
        """Utility to decode bytes to str, ignoring errors, otherwise return as-is.

        Args:
            value (str | bytes): The input value, which may be a string or a byte sequence.

        Returns:
            str: The decoded string if the input was bytes, or the original string
                 if the input was already a string.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8", "ignore")
        return value
208

209

210
# --- Example usage ---
if __name__ == "__main__":  # pragma: no cover
    sample_path = "samples/sample-pdf-a4-size.pdf"
    processor = PDFProcessor(sample_path)

    # Show the document's normalized info-dictionary fields.
    print("Metadata:")
    for key, value in processor.extract_metadata().items():
        print(f"{key}: {value}")

    # Stream pages and preview the first 512 characters of each.
    print("\nText content preview:\n")
    page_number = 0
    for page_text in processor.extract_text():
        page_number += 1
        print(f"--- page {page_number} ---")
        print(page_text[:512], "...")
        print("\n --- \n")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc