• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

speedyk-005 / chunklet-py / 23031162841

13 Mar 2026 12:54AM UTC coverage: 90.659% (-0.01%) from 90.671%
23031162841

Pull #14

github

web-flow
Merge a3ae6968f into 4c6b47c93
Pull Request #14: Refactor method ordering to follow Step-down Rule

352 of 372 new or added lines in 8 files covered. (94.62%)

1 existing line in 1 file now uncovered.

1349 of 1488 relevant lines covered (90.66%)

4.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.61
/src/chunklet/document_chunker/processors/pdf_processor.py
1
from typing import Any, Generator
5✔
2

3
import regex as re
5✔
4
from more_itertools import ilen
5✔
5

6
# pdfminer is lazily imported
7
from chunklet.document_chunker.processors.base_processor import BaseProcessor
5✔
8

9
# Collapses runs of two or more (possibly whitespace-padded) newlines into one.
MULTIPLE_NEWLINE_PATTERN = re.compile(r"(\n\s*){2,}")

# Matches a line consisting solely of Unicode digits (\p{N}) — typically a
# bare page number — including the surrounding newlines.
STANDALONE_NUMBER_PATTERN = re.compile(r"\n\s*\p{N}+\s*\n")

# Matches a one-character heading/list marker on its own line (e.g. "1.",
# "a)", "b-", "*"). The trailing newline is replaced so the marker is
# re-joined with the text that follows it.
HEADING_OR_LIST_PATTERN = re.compile(
    r"(\n"  # A newline
    r"[\p{L}\p{N}]"  # A single Unicode letter or Unicode number
    r"[.\-)*]"  # Followed by a marker punctuation: . - ) *
    r")\n",  # The newline character to be replaced
    re.U,
)

# Matches common page-number artifacts left behind by PDF text extraction.
PAGE_PATTERN = re.compile(
    r"Page \p{N}+ of \p{N}+.*|"  # "Page X of Y" plus trailing text
    r"-\s*\p{N}+\s*-|"  # Page numbers with dashes, e.g. "- 3 -"
    r"\s*\|\s*Page\s+\p{N}+\s*\|\s*",  # Boxed page numbers, e.g. "| Page 3 |"
    re.M,
)
30

31

32
class PDFProcessor(BaseProcessor):
    """
    PDF extraction and cleanup utility using `pdfminer.six`.

    Provides methods to extract text and metadata from PDF files,
    while cleaning and normalizing the extracted text using regex patterns.

    This processor extracts **metadata** from the PDF document's **information
    dictionary**, focusing on core metadata rather than all available fields.

    For more details on PDF metadata extraction using `pdfminer.six`, refer to
    this relevant Stack Overflow discussion:

    https://stackoverflow.com/questions/75591385/extract-metadata-info-from-online-pdf-using-pdfminer-in-python
    """

    # Normalized metadata keys exposed by extract_metadata().
    METADATA_FIELDS = [
        "title",
        "author",
        "creator",
        "producer",
        "publisher",
        "created",
        "modified",
    ]

    # Maps raw PDF info-dictionary keys to the normalized names above.
    PDF_METADATA_KEY_MAP = {
        "CreationDate": "created",
        "ModDate": "modified",
    }

    def __init__(self, file_path: str):
        """Initialize the PDFProcessor.

        Args:
            file_path (str): Path to the PDF file.

        Raises:
            ImportError: If the optional `pdfminer.six` dependency is not
                installed.
        """
        # pdfminer is imported lazily so this module can be imported even
        # when the optional dependency is absent.
        try:
            from pdfminer.layout import LAParams
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "The 'pdfminer.six' library is not installed. "
                "Please install it with 'pip install 'pdfminer.six>=20250324'' or install the document processing extras "
                "with 'pip install 'chunklet-py[structured-document]''"
            ) from e
        self.file_path = file_path
        # Layout-analysis parameters controlling how pdfminer groups lines.
        self.laparams = LAParams(
            line_margin=0.5,
        )

    def extract_text(self) -> Generator[str, None, None]:
        """Yield cleaned text from each PDF page.

        Extracts text page by page via ``pdfminer.high_level.extract_text``
        so that only one page's text is held in memory at a time, keeping
        peak memory bounded for large files. Each page's raw text is
        normalized with :meth:`_cleanup_text` before being yielded.

        Yields:
            str: Cleaned text content from each PDF page.
        """
        from pdfminer.high_level import extract_text
        from pdfminer.pdfpage import PDFPage

        with open(self.file_path, "rb") as fp:
            page_count = ilen(PDFPage.get_pages(fp))

            for page_num in range(page_count):
                # NOTE(review): extract_text() re-opens and re-parses the
                # document on every call, so this loop trades parse time for
                # bounded memory — it is *not* cheaper than a single
                # whole-document extraction in CPU terms.
                raw_text = extract_text(
                    self.file_path,
                    page_numbers=[page_num],
                    laparams=self.laparams,
                )
                yield self._cleanup_text(raw_text)

    def _cleanup_text(self, text: str) -> str:
        """Clean and normalize extracted PDF text.

        Performs, in order:
            - Collapse runs of blank/multiple newlines into a single newline
            - Re-join single-character heading/list markers (e.g. "1.", "a)")
              with the text on the following line
            - Remove lines containing only numbers (page numbers)
            - Strip common page-number artifacts ("Page X of Y", "- 3 -",
              "| Page 3 |")

        Args:
            text (str): Raw text extracted from a PDF page.

        Returns:
            str: Cleaned and normalized text; empty string for empty input.
        """
        if not text:
            return ""
        # Order matters: newline collapsing must run before the patterns
        # that anchor on single newlines.
        text = MULTIPLE_NEWLINE_PATTERN.sub("\n", text)
        text = HEADING_OR_LIST_PATTERN.sub(r"\1 ", text)
        text = STANDALONE_NUMBER_PATTERN.sub("", text)
        text = PAGE_PATTERN.sub("", text)
        return text

    def extract_metadata(self) -> dict[str, Any]:
        """Extracts metadata from the PDF document's information dictionary.

        Includes source path, page count, and PDF info fields.

        Returns:
            dict[str, Any]: A dictionary containing ``source``,
                ``page_count``, and any of the normalized info fields:
                title, author, creator, producer, publisher, created,
                modified.
        """
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser

        metadata = {"source": str(self.file_path), "page_count": 0}
        with open(self.file_path, "rb") as f:
            # Initialize parser on the file stream
            parser = PDFParser(f)

            # PDFDocument reads file structure, consuming the file pointer
            doc = PDFDocument(parser)

            # Reset pointer to start of file stream for accurate page counting
            f.seek(0)

            metadata["page_count"] = ilen(PDFPage.get_pages(f))
            metadata.update(self._extract_info_metadata(doc))

        return metadata

    def _extract_info_metadata(self, doc: Any) -> dict:
        """Extract metadata from PDF document info dictionary.

        Reads PDF info fields and extracts standardized metadata fields
        defined in METADATA_FIELDS.

        Args:
            doc (Any): PDFDocument instance with an ``info`` attribute.

        Returns:
            dict: Dictionary of normalized metadata key-value pairs.
        """
        metadata = {}
        if not (hasattr(doc, "info") and doc.info):
            return metadata

        # doc.info is a list of info dictionaries (one per document revision).
        for info in doc.info:
            for raw_key, raw_value in info.items():
                # Decode once, then map pdfminer-native key names
                # (e.g. "CreationDate") onto our normalized field names.
                decoded_key = self._safe_decode(raw_key)
                key = self.PDF_METADATA_KEY_MAP.get(decoded_key, decoded_key)
                value = self._safe_decode(raw_value)
                key_lower = key.lower()
                if key_lower in self.METADATA_FIELDS:
                    metadata[key_lower] = value
        return metadata

    def _safe_decode(self, value: str | bytes) -> str:
        """Utility to decode bytes to str, ignoring errors, otherwise return as-is.

        Args:
            value (str | bytes): The input value, which may be a string or a byte sequence.

        Returns:
            str: The decoded string if the input was bytes, or the original string
                 if the input was already a string.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8", "ignore")
        return value
208

209

210
# --- Example usage ---
if __name__ == "__main__":  # pragma: no cover
    sample_path = "samples/sample-pdf-a4-size.pdf"
    processor = PDFProcessor(sample_path)

    # Show the document's normalized info-dictionary fields.
    print("Metadata:")
    for key, value in processor.extract_metadata().items():
        print(f"{key}: {value}")

    # Stream pages and preview the first 512 characters of each.
    print("\nText content preview:\n")
    page_number = 0
    for page_text in processor.extract_text():
        page_number += 1
        print(f"--- page {page_number} ---")
        print(page_text[:512], "...")
        print("\n --- \n")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc