12298066419

Committed 12 Dec 2024 02:07PM UTC coverage: 90.408% (+0.06%) from 90.346%

Build # 12298066419

Build Type

Pull #8522

github

Committed by

web-flow

Commit Message

Merge 669550d36 into 04fc187bc

Pull Request Pull Request #8522: feat: Add XLSXToDocument converter

Run Details

8096 of 8955 relevant lines covered (90.41%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.95

haystack/components/converters/pypdf.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import io
import os
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

# Guarded import of the optional `pypdf` dependency; the message carries the install hint.
# `pypdf_import.check()` is called in `PyPDFToDocument.__init__` -- presumably that is where a
# missing package is reported rather than at module import time (verify against LazyImport).
with LazyImport("Run 'pip install pypdf'") as pypdf_import:
    from pypdf import PdfReader


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class PyPDFExtractionMode(Enum):
    """
    Text extraction strategies available when reading a PDF.

    Each member's value is the plain string name of the mode.
    """

    PLAIN = "plain"
    LAYOUT = "layout"

    def __str__(self) -> str:
        """
        Return the string value of this mode (for example `"plain"`).
        """
        return self.value

    @staticmethod
    def from_str(string: str) -> "PyPDFExtractionMode":
        """
        Look up the enum member whose value equals the given string.

        :param string:
            The textual name of the mode, for example `"plain"` or `"layout"`.

        :returns:
            The matching enum member.

        :raises ValueError:
            If no member has the given value.
        """
        members_by_value = {member.value: member for member in PyPDFExtractionMode}
        if string not in members_by_value:
            supported = list(members_by_value)
            message = f"Unknown extraction mode '{string}'. Supported modes are: {supported}"
            raise ValueError(message)
        return members_by_value[string]


@component
class PyPDFToDocument:
    """
    Turns PDF files into documents by extracting their text with the PyPDF library.

    Each source that can be read and converted yields one document.
    Metadata can be attached to the resulting documents.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.pypdf import PyPDFToDocument

    converter = PyPDFToDocument()
    results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the PDF file.'
    ```
    """

    def __init__(
        self,
        *,
        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
        plain_mode_orientations: tuple = (0, 90, 180, 270),
        plain_mode_space_width: float = 200.0,
        layout_mode_space_vertically: bool = True,
        layout_mode_scale_weight: float = 1.25,
        layout_mode_strip_rotated: bool = True,
        layout_mode_font_height_weight: float = 1.0,
        store_full_path: bool = False,
    ):
        """
        Create a PyPDFToDocument component.

        :param extraction_mode:
            How text is extracted from the PDF, given either as a `PyPDFExtractionMode`
            member or as its string value.
            Layout mode is experimental and follows the rendered layout of the PDF.
        :param plain_mode_orientations:
            Orientations to consider when extracting text in plain mode.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
        :param plain_mode_space_width:
            Default space width to force when it cannot be extracted from the font.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
        :param layout_mode_space_vertically:
            Whether blank lines inferred from y distance + font height are included.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
        :param layout_mode_scale_weight:
            Multiplier applied to string length when computing the weighted average character width.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
        :param layout_mode_strip_rotated:
            Rotated text is not supported by layout mode. Set to `False` to include it anyway;
            if rotated text is found, the layout is degraded and a warning is logged.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
        :param layout_mode_font_height_weight:
            Multiplier applied to font height when computing blank line height.
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
        :param store_full_path:
            If True, the document metadata keeps the full path of the file.
            If False, only the file name is kept.
        """
        pypdf_import.check()

        # Accept the mode as a plain string as well, e.g. when restored from serialized data.
        resolved_mode = extraction_mode
        if isinstance(resolved_mode, str):
            resolved_mode = PyPDFExtractionMode.from_str(resolved_mode)

        self.store_full_path = store_full_path
        self.extraction_mode = resolved_mode
        self.plain_mode_orientations = plain_mode_orientations
        self.plain_mode_space_width = plain_mode_space_width
        self.layout_mode_space_vertically = layout_mode_space_vertically
        self.layout_mode_scale_weight = layout_mode_scale_weight
        self.layout_mode_strip_rotated = layout_mode_strip_rotated
        self.layout_mode_font_height_weight = layout_mode_font_height_weight

    def to_dict(self):
        """
        Serializes the component to a dictionary.

        The extraction mode is stored as its string value.

        :returns:
            Dictionary with serialized data.
        """
        init_parameters = {
            "extraction_mode": str(self.extraction_mode),
            "plain_mode_orientations": self.plain_mode_orientations,
            "plain_mode_space_width": self.plain_mode_space_width,
            "layout_mode_space_vertically": self.layout_mode_space_vertically,
            "layout_mode_scale_weight": self.layout_mode_scale_weight,
            "layout_mode_strip_rotated": self.layout_mode_strip_rotated,
            "layout_mode_font_height_weight": self.layout_mode_font_height_weight,
            "store_full_path": self.store_full_path,
        }
        return default_to_dict(self, **init_parameters)

    @classmethod
    def from_dict(cls, data):
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary with serialized data.

        :returns:
            Deserialized component.
        """
        return default_from_dict(cls, data)

    def _default_convert(self, reader: "PdfReader") -> Document:
        """
        Extracts the text of every page and returns it as a single document.

        The page texts are joined with a form feed character.
        """
        extraction_options = {
            "orientations": self.plain_mode_orientations,
            "extraction_mode": self.extraction_mode.value,
            "space_width": self.plain_mode_space_width,
            "layout_mode_space_vertically": self.layout_mode_space_vertically,
            "layout_mode_scale_weight": self.layout_mode_scale_weight,
            "layout_mode_strip_rotated": self.layout_mode_strip_rotated,
            "layout_mode_font_height_weight": self.layout_mode_font_height_weight,
        }
        page_texts = [page.extract_text(**extraction_options) for page in reader.pages]
        return Document(content="\f".join(page_texts))

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts PDF files to documents.

        Sources that cannot be read or converted are skipped with a warning.
        A source from which no text could be extracted still yields an (empty) document.

        :param sources:
            List of file paths or ByteStream objects to convert.
        :param meta:
            Optional metadata to attach to the documents.
            Either a single dictionary or a list of dictionaries.
            A single dictionary is added to the metadata of all produced documents.
            A list must have the same length as `sources`, because the two are zipped together.
            For ByteStream objects, their `meta` is added to the output documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: A list of converted documents.
        """
        converted = []
        per_source_meta = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, per_source_meta):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as read_error:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=read_error)
                continue

            try:
                document = self._default_convert(PdfReader(io.BytesIO(bytestream.data)))
            except Exception as convert_error:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}",
                    source=source,
                    error=convert_error,
                )
                continue

            extracted = document.content
            if extracted is None or not extracted.strip():
                logger.warning(
                    "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
                    source=source,
                )

            # Entries passed in `meta` take precedence over the ByteStream's own metadata.
            combined_meta = dict(bytestream.meta)
            combined_meta.update(metadata)

            if not self.store_full_path:
                # Keep only the file name, taken from the ByteStream's `file_path` entry.
                source_path = bytestream.meta.get("file_path")
                if source_path:
                    combined_meta["file_path"] = os.path.basename(source_path)

            document.meta = combined_meta
            converted.append(document)

        return {"documents": converted}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import io	1✔
6	import os	1✔
7	from enum import Enum	1✔
8	from pathlib import Path	1✔
9	from typing import Any, Dict, List, Optional, Union	1✔
10
11	from haystack import Document, component, default_from_dict, default_to_dict, logging	1✔
12	from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata	1✔
13	from haystack.dataclasses import ByteStream	1✔
14	from haystack.lazy_imports import LazyImport	1✔
15
16	with LazyImport("Run 'pip install pypdf'") as pypdf_import:	1✔
17	from pypdf import PdfReader	1✔
18
19
20	logger = logging.getLogger(__name__)	1✔
21
22
23	class PyPDFExtractionMode(Enum):	1✔
24	"""
25	The mode to use for extracting text from a PDF.
26	"""
27
28	PLAIN = "plain"	1✔
29	LAYOUT = "layout"	1✔
30
31	def __str__(self) -> str:	1✔
32	"""
33	Convert a PyPDFExtractionMode enum to a string.
34	"""
35	return self.value	1✔
36
37	@staticmethod	1✔
38	def from_str(string: str) -> "PyPDFExtractionMode":	1✔
39	"""
40	Convert a string to a PyPDFExtractionMode enum.
41	"""
42	enum_map = {e.value: e for e in PyPDFExtractionMode}	1✔
43	mode = enum_map.get(string)	1✔
44	if mode is None:	1✔
45	msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"	1✔
46	raise ValueError(msg)	1✔
47	return mode	1✔
48
49
50	@component	1✔
51	class PyPDFToDocument:	1✔
52	"""
53	Converts PDF files to documents your pipeline can query.
54
55	This component uses the PyPDF library.
56	You can attach metadata to the resulting documents.
57
58	### Usage example
59
60	```python
61	from haystack.components.converters.pypdf import PyPDFToDocument
62
63	converter = PyPDFToDocument()
64	results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
65	documents = results["documents"]
66	print(documents[0].content)
67	# 'This is a text from the PDF file.'
68	```
69	"""
70
71	def __init__(	1✔
72	self,
73	*,
74	extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
75	plain_mode_orientations: tuple = (0, 90, 180, 270),
76	plain_mode_space_width: float = 200.0,
77	layout_mode_space_vertically: bool = True,
78	layout_mode_scale_weight: float = 1.25,
79	layout_mode_strip_rotated: bool = True,
80	layout_mode_font_height_weight: float = 1.0,
81	store_full_path: bool = False,
82	):
83	"""
84	Create an PyPDFToDocument component.
85
86	:param extraction_mode:
87	The mode to use for extracting text from a PDF.
88	Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
89	:param plain_mode_orientations:
90	Tuple of orientations to look for when extracting text from a PDF in plain mode.
91	Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
92	:param plain_mode_space_width:
93	Forces default space width if not extracted from font.
94	Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
95	:param layout_mode_space_vertically:
96	Whether to include blank lines inferred from y distance + font height.
97	Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
98	:param layout_mode_scale_weight:
99	Multiplier for string length when calculating weighted average character width.
100	Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
101	:param layout_mode_strip_rotated:
102	Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
103	If rotated text is discovered, layout will be degraded and a warning will be logged.
104	Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
105	:param layout_mode_font_height_weight:
106	Multiplier for font height when calculating blank line height.
107	Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
108	:param store_full_path:
109	If True, the full path of the file is stored in the metadata of the document.
110	If False, only the file name is stored.
111	"""
112	pypdf_import.check()	1✔
113
114	self.store_full_path = store_full_path	1✔
115
116	if isinstance(extraction_mode, str):	1✔
117	extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)	1✔
118	self.extraction_mode = extraction_mode	1✔
119	self.plain_mode_orientations = plain_mode_orientations	1✔
120	self.plain_mode_space_width = plain_mode_space_width	1✔
121	self.layout_mode_space_vertically = layout_mode_space_vertically	1✔
122	self.layout_mode_scale_weight = layout_mode_scale_weight	1✔
123	self.layout_mode_strip_rotated = layout_mode_strip_rotated	1✔
124	self.layout_mode_font_height_weight = layout_mode_font_height_weight	1✔
125
126	def to_dict(self):	1✔
127	"""
128	Serializes the component to a dictionary.
129
130	:returns:
131	Dictionary with serialized data.
132	"""
133	return default_to_dict(	1✔
134	self,
135	extraction_mode=str(self.extraction_mode),
136	plain_mode_orientations=self.plain_mode_orientations,
137	plain_mode_space_width=self.plain_mode_space_width,
138	layout_mode_space_vertically=self.layout_mode_space_vertically,
139	layout_mode_scale_weight=self.layout_mode_scale_weight,
140	layout_mode_strip_rotated=self.layout_mode_strip_rotated,
141	layout_mode_font_height_weight=self.layout_mode_font_height_weight,
142	store_full_path=self.store_full_path,
143	)
144
145	@classmethod	1✔
146	def from_dict(cls, data):	1✔
147	"""
148	Deserializes the component from a dictionary.
149
150	:param data:
151	Dictionary with serialized data.
152
153	:returns:
154	Deserialized component.
155	"""
156	return default_from_dict(cls, data)	1✔
157
158	def _default_convert(self, reader: "PdfReader") -> Document:	1✔
159	texts = []	1✔
160	for page in reader.pages:	1✔
161	texts.append(	1✔
162	page.extract_text(
163	orientations=self.plain_mode_orientations,
164	extraction_mode=self.extraction_mode.value,
165	space_width=self.plain_mode_space_width,
166	layout_mode_space_vertically=self.layout_mode_space_vertically,
167	layout_mode_scale_weight=self.layout_mode_scale_weight,
168	layout_mode_strip_rotated=self.layout_mode_strip_rotated,
169	layout_mode_font_height_weight=self.layout_mode_font_height_weight,
170	)
171	)
172	text = "\f".join(texts)	1✔
173	return Document(content=text)	1✔
174
175	@component.output_types(documents=List[Document])	1✔
176	def run(	1✔
177	self,
178	sources: List[Union[str, Path, ByteStream]],
179	meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
180	):
181	"""
182	Converts PDF files to documents.
183
184	:param sources:
185	List of file paths or ByteStream objects to convert.
186	:param meta:
187	Optional metadata to attach to the documents.
188	This value can be a list of dictionaries or a single dictionary.
189	If it's a single dictionary, its content is added to the metadata of all produced documents.
190	If it's a list, its length must match the number of sources, as they are zipped together.
191	For ByteStream objects, their `meta` is added to the output documents.
192
193	:returns:
194	A dictionary with the following keys:
195	- `documents`: A list of converted documents.
196	"""
197	documents = []	1✔
198	meta_list = normalize_metadata(meta, sources_count=len(sources))	1✔
199
200	for source, metadata in zip(sources, meta_list):	1✔
201	try:	1✔
202	bytestream = get_bytestream_from_source(source)	1✔
203	except Exception as e:	1✔
204	logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)	1✔
205	continue	1✔
206	try:	1✔
207	pdf_reader = PdfReader(io.BytesIO(bytestream.data))	1✔
208	document = self._default_convert(pdf_reader)	1✔
209	except Exception as e:	×
210	logger.warning(	×
211	"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
212	)
213	continue	×
214
215	if document.content is None or document.content.strip() == "":	1✔
216	logger.warning(	1✔
217	"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
218	source=source,
219	)
220
221	merged_metadata = {**bytestream.meta, **metadata}	1✔
222
223	if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):	1✔
224	merged_metadata["file_path"] = os.path.basename(file_path)	1✔
225	document.meta = merged_metadata	1✔
226	documents.append(document)	1✔
227
228	return {"documents": documents}	1✔

deepset-ai / haystack / 12298066419

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous