24798516591

Committed 22 Apr 2026 07:19PM UTC coverage: 90.606% (-0.2%) from 90.758%

Build # 24798516591

Build Type

push

github

Committed by

speedyk-005

Commit Message

refactor: remove redundant type hints from docstrings

- Strip (type) from Args/Returns where signature already has types
- Simplify Returns format to prose description
- Run clean_docstrings.py on src/chunklet (26 files)
- Add ExtractionState TypedDict for type safety (from earlier refactor)

Coverage Stats

1360 of 1501 relevant lines covered (90.61%)

3.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.61

/src/chunklet/document_chunker/processors/pdf_processor.py

from typing import Any, Generator

import re
from more_itertools import ilen

# pdfminer is lazily imported

from chunklet.document_chunker.processors.base_processor import BaseProcessor


# Two or more consecutive newlines, each optionally followed by whitespace.
MULTIPLE_NEWLINE_PATTERN = re.compile(r"(\n\s*){2,}")

# A line that holds nothing but digits (typically a bare page number).
# The closing newline is asserted with a lookahead rather than consumed, so
# substituting "" removes the number line while keeping the line break that
# separates its neighbours, and back-to-back number lines are all matched.
STANDALONE_NUMBER_PATTERN = re.compile(r"\n\s*\d+\s*(?=\n)")

# Pattern to merge single newlines within logical text blocks
HEADING_OR_LIST_PATTERN = re.compile(
    r"(\n"  # A newline
    r"[\w\d]"  # A Unicode letter or number
    r"[.\-)*]"  # Followed by a punctuation
    r")\n",  # The newline character to be replaced
    re.U,
)

# Page-number decorations. The dashed form is anchored to a whole line
# (re.M makes ^/$ match at line boundaries) so hyphenated inline values such
# as ISO dates or phone numbers are left intact. The outer whitespace of the
# boxed form is limited to spaces/tabs so it cannot swallow line breaks.
PAGE_PATTERN = re.compile(
    r"Page \d+ of \d+.*|"  # "Page X of Y" plus the rest of that line
    r"^[ \t]*-[ \t]*\d+[ \t]*-[ \t]*$|"  # A line holding only "- N -"
    r"[ \t]*\|[ \t]*Page\s+\d+[ \t]*\|[ \t]*",  # Boxed page numbers
    re.M,
)


class PDFProcessor(BaseProcessor):
    """
    PDF extraction and cleanup utility using `pdfminer.six`.

    Provides methods to extract text and metadata from PDF files,
    while cleaning and normalizing the extracted text using regex patterns.

    This processor extracts **metadata** from the PDF document's **information
    dictionary**, focusing on core metadata rather than all available fields.

    `pdfminer.six` is imported lazily, inside the methods that use it.

    For more details on PDF metadata extraction using `pdfminer.six`, refer to
    this relevant Stack Overflow discussion:

    https://stackoverflow.com/questions/75591385/extract-metadata-info-from-online-pdf-using-pdfminer-in-python
    """

    # Normalized (lower-case) metadata keys kept by `_extract_info_metadata`.
    METADATA_FIELDS = [
        "title",
        "author",
        "creator",
        "producer",
        "publisher",
        "created",
        "modified",
    ]

    # Info-dictionary keys that are renamed before the METADATA_FIELDS check.
    PDF_METADATA_KEY_MAP = {
        "CreationDate": "created",
        "ModDate": "modified",
    }

    def __init__(self, file_path: str):
        """Initialize the PDFProcessor.

        Args:
            file_path: Path to the PDF file.

        Raises:
            ImportError: If `pdfminer.six` cannot be imported.
        """
        try:
            from pdfminer.layout import LAParams
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "The 'pdfminer.six' library is not installed. "
                "Please install it with 'pip install 'pdfminer.six>=20250324'' or install the document processing extras "
                "with 'pip install 'chunklet-py[structured-document]''"
            ) from e
        self.file_path = file_path
        # Layout-analysis parameters reused for every page extraction.
        self.laparams = LAParams(
            line_margin=0.5,
        )

    def _cleanup_text(self, text: str) -> str:
        """Clean and normalize extracted PDF text.

        Applies, in order:
            - MULTIPLE_NEWLINE_PATTERN: collapse runs of newlines into one
            - HEADING_OR_LIST_PATTERN: join a short heading/list marker
              with the line that follows it
            - STANDALONE_NUMBER_PATTERN: remove lines containing only numbers
              (page numbers)
            - PAGE_PATTERN: remove page-number decorations

        Args:
            text: Raw text extracted from PDF page.

        Returns:
            Cleaned and normalized text, or an empty string for empty input.
        """
        if not text:
            return ""
        text = MULTIPLE_NEWLINE_PATTERN.sub("\n", text)
        text = HEADING_OR_LIST_PATTERN.sub(r"\1 ", text)
        text = STANDALONE_NUMBER_PATTERN.sub("", text)
        text = PAGE_PATTERN.sub("", text)
        return text

    def _safe_decode(self, value: str | bytes) -> str:
        """Utility to decode bytes to str, ignoring errors, otherwise return as-is.

        Args:
            value: The input value, which may be a string or a byte sequence.

        Returns:
            The UTF-8 decoded string (undecodable bytes dropped) if the input
            was bytes; any other input is returned unchanged.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8", "ignore")
        return value

    def _extract_info_metadata(self, doc: Any) -> dict[str, Any]:
        """Extract metadata from PDF document info dictionary.

        Reads PDF info fields and extracts standardized metadata fields
        defined in METADATA_FIELDS. Keys are renamed through
        PDF_METADATA_KEY_MAP and lower-cased before the check. When several
        info dictionaries carry the same field, the last one wins.

        Args:
            doc: PDFDocument instance with info attribute.

        Returns:
            Dictionary of normalized metadata key-value pairs; empty when the
            document has no (or an empty) `info` attribute.
        """
        metadata: dict[str, Any] = {}
        if not getattr(doc, "info", None):
            return metadata

        for info in doc.info:
            for raw_key, raw_value in info.items():
                # Decode once, then map PDF-specific names to normalized ones.
                decoded_key = self._safe_decode(raw_key)
                field = self.PDF_METADATA_KEY_MAP.get(decoded_key, decoded_key).lower()
                if field in self.METADATA_FIELDS:
                    metadata[field] = self._safe_decode(raw_value)
        return metadata

    def extract_text(self) -> Generator[str, None, None]:
        """Yield cleaned text from each PDF page.

        Counts the pages first, then extracts one page per call to
        pdfminer.high_level.extract_text so that only a single page's text is
        produced at a time. Each page's text is passed through _cleanup_text
        before being yielded.

        Yields:
            Cleaned text content from each PDF page.
        """
        from pdfminer.high_level import extract_text
        from pdfminer.pdfpage import PDFPage

        # The handle is only needed for counting pages. Close it before the
        # first yield so a suspended or abandoned generator does not keep the
        # file open.
        with open(self.file_path, "rb") as fp:
            page_count = ilen(PDFPage.get_pages(fp))

        for page_num in range(page_count):
            # NOTE(review): extract_text receives the file path on every call,
            # so pdfminer presumably re-opens and re-parses the document for
            # each page - confirm if very large PDFs turn out to be slow.
            raw_text = extract_text(
                self.file_path,
                page_numbers=[page_num],
                laparams=self.laparams,
            )
            yield self._cleanup_text(raw_text)

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts metadata from the PDF document's information dictionary.

        Includes source path, page count, and PDF info fields.

        Returns:
            A dictionary that always contains:
                - source (the file path as a string)
                - page_count
            plus whichever of these fields the info dictionary provides:
                - title
                - author
                - creator
                - producer
                - publisher
                - created
                - modified
        """
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser

        metadata = {"source": str(self.file_path), "page_count": 0}
        with open(self.file_path, "rb") as f:
            # Initialize parser on the file stream
            parser = PDFParser(f)

            # PDFDocument reads file structure, consuming the file pointer
            doc = PDFDocument(parser)

            # Reset pointer to start of file stream for accurate page counting
            f.seek(0)

            metadata["page_count"] = ilen(PDFPage.get_pages(f))
            metadata.update(self._extract_info_metadata(doc))

        return metadata


# --- Example usage ---
if __name__ == "__main__":  # pragma: no cover
    # Manual smoke run: print the document metadata, then the first 512
    # characters of every page's cleaned text.
    demo = PDFProcessor("samples/sample-pdf-a4-size.pdf")
    details = demo.extract_metadata()

    print("Metadata:")
    for field, value in details.items():
        print(f"{field}: {value}")

    print("\nText content preview:\n")
    for number, content in enumerate(demo.extract_text(), start=1):
        print(f"--- page {number} ---")
        preview = content[:512]
        print(preview, "...")
        print("\n --- \n")

1	from typing import Any, Generator	4✔
2
3	import re	4✔
4	from more_itertools import ilen	4✔
5
6	# pdfminer is lazily imported
7
8	from chunklet.document_chunker.processors.base_processor import BaseProcessor	4✔
9
10
11	MULTIPLE_NEWLINE_PATTERN = re.compile(r"(\n\s*){2,}")	4✔
12	STANDALONE_NUMBER_PATTERN = re.compile(r"\n\s*\d+\s*\n")	4✔
13
14	# Pattern to merge single newlines within logical text blocks
15	HEADING_OR_LIST_PATTERN = re.compile(	4✔
16	r"(\n" # A newline
17	r"[\w\d]" # A Unicode letter or number
18	r"[.\-)*]" # Followed by a punctuation
19	r")\n", # The newline character to be replaced
20	re.U,
21	)
22
23	PAGE_PATTERN = re.compile(	4✔
24	r"Page \d+ of \d+.*|" # standalone page number
25	r"-\s*\d+\s*-|" # Page numbers with dashes
26	r"\s*\|\s*Page\s+\d+\s*\|\s*", # Boxed page numbers
27	re.M,
28	)
29
30
31	class PDFProcessor(BaseProcessor):	4✔
32	"""
33	PDF extraction and cleanup utility using `pdfminer.six`.
34
35	Provides methods to extract text and metadata from PDF files,
36	while cleaning and normalizing the extracted text using regex patterns.
37
38	This processor extracts metadata from the PDF document's **information
39	dictionary**, focusing on core metadata rather than all available fields.
40
41	For more details on PDF metadata extraction using `pdfminer.six`, refer to
42	this relevant Stack Overflow discussion:
43
44	https://stackoverflow.com/questions/75591385/extract-metadata-info-from-online-pdf-using-pdfminer-in-python
45	"""
46
47	METADATA_FIELDS = [	4✔
48	"title",
49	"author",
50	"creator",
51	"producer",
52	"publisher",
53	"created",
54	"modified",
55	]
56
57	PDF_METADATA_KEY_MAP = {	4✔
58	"CreationDate": "created",
59	"ModDate": "modified",
60	}
61
62	def __init__(self, file_path: str):	4✔
63	"""Initialize the PDFProcessor.
64
65	Args:
66	file_path: Path to the PDF file.
67	"""
68	try:	4✔
69	from pdfminer.layout import LAParams	4✔
70	except ImportError as e: # pragma: no cover
71	raise ImportError(
72	"The 'pdfminer.six' library is not installed. "
73	"Please install it with 'pip install 'pdfminer.six>=20250324'' or install the document processing extras "
74	"with 'pip install 'chunklet-py[structured-document]''"
75	) from e
76	self.file_path = file_path	4✔
77	self.laparams = LAParams(	4✔
78	line_margin=0.5,
79	)
80
81	def _cleanup_text(self, text: str) -> str:	4✔
82	"""Clean and normalize extracted PDF text.
83
84	Performs:
85	- Collapse multiple newlines
86	- Remove lines containing only numbers (page numbers)
87	- Split concatenated words with punctuation and numbers
88	- Collapse multiple spaces
89	- Remove zero-width / non-breaking characters
90
91	Args:
92	text: Raw text extracted from PDF page.
93
94	Returns:
95	Cleaned and normalized text.
96	"""
97	if not text:	4✔
98	return ""	×
99	text = MULTIPLE_NEWLINE_PATTERN.sub("\n", text)	4✔
100	text = HEADING_OR_LIST_PATTERN.sub(r"\1 ", text)	4✔
101	text = STANDALONE_NUMBER_PATTERN.sub("", text)	4✔
102	text = PAGE_PATTERN.sub("", text)	4✔
103	return text	4✔
104
105	def _safe_decode(self, value: str | bytes):	4✔
106	"""Utility to decode bytes to str, ignoring errors, otherwise return as-is.
107
108	Args:
109	value: The input value, which may be a string or a byte sequence.
110
111	Returns:
112	The decoded string if the input was bytes, or the original string
113	if the input was already a string.
114	"""
115	if isinstance(value, bytes):	4✔
116	return value.decode("utf-8", "ignore")	4✔
117	return value	4✔
118
119	def _extract_info_metadata(self, doc: Any) -> dict:	4✔
120	"""Extract metadata from PDF document info dictionary.
121
122	Reads PDF info fields and extracts standardized metadata fields
123	defined in METADATA_FIELDS.
124
125	Args:
126	doc: PDFDocument instance with info attribute.
127
128	Returns:
129	Dictionary of normalized metadata key-value pairs.
130	"""
131	metadata = {}	4✔
132	if not (hasattr(doc, "info") and doc.info):	4✔
133	return metadata	×
134
135	for info in doc.info:	4✔
136	for k, v in info.items():	4✔
137	k = self.PDF_METADATA_KEY_MAP.get(	4✔
138	self._safe_decode(k), self._safe_decode(k)
139	)
140	v = self._safe_decode(v)	4✔
141	if k.lower() in self.METADATA_FIELDS:	4✔
142	metadata[k.lower()] = v	4✔
143	return metadata	4✔
144
145	def extract_text(self) -> Generator[str, None, None]:	4✔
146	"""Yield cleaned text from each PDF page.
147
148	Extracts text content page by page using pdfminer.high_level.extract_text
149	for efficient processing. Each page is processed individually to avoid
150	memory issues with large PDF files. The extracted text is cleaned using
151	the _cleanup_text method to remove artifacts and normalize formatting.
152
153	Yields:
154	Cleaned text content from each PDF page.
155	"""
156	from pdfminer.high_level import extract_text	4✔
157	from pdfminer.pdfpage import PDFPage	4✔
158
159	with open(self.file_path, "rb") as fp:	4✔
160	page_count = ilen(PDFPage.get_pages(fp))	4✔
161
162	for page_num in range(page_count):	4✔
163	# Call extract_text on the file path, specifying the page number.
164	# This is efficient as it avoids repeated file seeks/parsing
165	# within the loop that was present in the old `extract_text_to_fp` approach.
166	raw_text = extract_text(	4✔
167	self.file_path,
168	page_numbers=[page_num],
169	laparams=self.laparams,
170	)
171	yield self._cleanup_text(raw_text)	4✔
172
173	def extract_metadata(self) -> dict[str, Any]:	4✔
174	"""Extracts metadata from the PDF document's information dictionary.
175
176	Includes source path, page count, and PDF info fields.
177
178	Returns:
179	A dictionary containing metadata fields:
180	- title
181	- author
182	- creator
183	- producer
184	- publisher
185	- created
186	- modified
187	"""
188	from pdfminer.pdfdocument import PDFDocument	4✔
189	from pdfminer.pdfpage import PDFPage	4✔
190	from pdfminer.pdfparser import PDFParser	4✔
191
192	metadata = {"source": str(self.file_path), "page_count": 0}	4✔
193	with open(self.file_path, "rb") as f:	4✔
194	# Initialize parser on the file stream
195	parser = PDFParser(f)	4✔
196
197	# PDFDocument reads file structure, consuming the file pointer
198	doc = PDFDocument(parser)	4✔
199
200	# Reset pointer to start of file stream for accurate page counting
201	f.seek(0)	4✔
202
203	metadata["page_count"] = ilen(PDFPage.get_pages(f))	4✔
204	metadata.update(self._extract_info_metadata(doc))	4✔
205
206	return metadata	4✔
207
208
209	# --- Example usage ---
210	if __name__ == "__main__": # pragma: no cover
211	pdf_file = "samples/sample-pdf-a4-size.pdf"
212
213	processor = PDFProcessor(pdf_file)
214	meta = processor.extract_metadata()
215
216	print("Metadata:")
217	for k, v in meta.items():
218	print(f"{k}: {v}")
219
220	print("\nText content preview:\n")
221	for i, page_text in enumerate(processor.extract_text(), start=1):
222	print(f"--- page {i} ---")
223	print(page_text[:512], "...")
224	print("\n --- \n")

speedyk-005 / chunklet-py / 24798516591

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous