• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 12298066419

12 Dec 2024 02:07PM UTC coverage: 90.408% (+0.06%) from 90.346%
12298066419

Pull #8522

github

web-flow
Merge 669550d36 into 04fc187bc
Pull Request #8522: feat: Add XLSXToDocument converter

8096 of 8955 relevant lines covered (90.41%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.95
haystack/components/converters/pypdf.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import io
1✔
6
import os
1✔
7
from enum import Enum
1✔
8
from pathlib import Path
1✔
9
from typing import Any, Dict, List, Optional, Union
1✔
10

11
from haystack import Document, component, default_from_dict, default_to_dict, logging
1✔
12
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
13
from haystack.dataclasses import ByteStream
1✔
14
from haystack.lazy_imports import LazyImport
1✔
15

16
with LazyImport("Run 'pip install pypdf'") as pypdf_import:
1✔
17
    from pypdf import PdfReader
1✔
18

19

20
logger = logging.getLogger(__name__)
1✔
21

22

23
class PyPDFExtractionMode(Enum):
1✔
24
    """
25
    The mode to use for extracting text from a PDF.
26
    """
27

28
    PLAIN = "plain"
1✔
29
    LAYOUT = "layout"
1✔
30

31
    def __str__(self) -> str:
1✔
32
        """
33
        Convert a PyPDFExtractionMode enum to a string.
34
        """
35
        return self.value
1✔
36

37
    @staticmethod
1✔
38
    def from_str(string: str) -> "PyPDFExtractionMode":
1✔
39
        """
40
        Convert a string to a PyPDFExtractionMode enum.
41
        """
42
        enum_map = {e.value: e for e in PyPDFExtractionMode}
1✔
43
        mode = enum_map.get(string)
1✔
44
        if mode is None:
1✔
45
            msg = f"Unknown extraction mode '{string}'. Supported modes are: {list(enum_map.keys())}"
1✔
46
            raise ValueError(msg)
1✔
47
        return mode
1✔
48

49

50
@component
1✔
51
class PyPDFToDocument:
1✔
52
    """
53
    Converts PDF files to documents your pipeline can query.
54

55
    This component uses the PyPDF library.
56
    You can attach metadata to the resulting documents.
57

58
    ### Usage example
59

60
    ```python
61
    from haystack.components.converters.pypdf import PyPDFToDocument
62

63
    converter = PyPDFToDocument()
64
    results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
65
    documents = results["documents"]
66
    print(documents[0].content)
67
    # 'This is a text from the PDF file.'
68
    ```
69
    """
70

71
    def __init__(
1✔
72
        self,
73
        *,
74
        extraction_mode: Union[str, PyPDFExtractionMode] = PyPDFExtractionMode.PLAIN,
75
        plain_mode_orientations: tuple = (0, 90, 180, 270),
76
        plain_mode_space_width: float = 200.0,
77
        layout_mode_space_vertically: bool = True,
78
        layout_mode_scale_weight: float = 1.25,
79
        layout_mode_strip_rotated: bool = True,
80
        layout_mode_font_height_weight: float = 1.0,
81
        store_full_path: bool = False,
82
    ):
83
        """
84
        Create a PyPDFToDocument component.
85

86
        :param extraction_mode:
87
            The mode to use for extracting text from a PDF.
88
            Layout mode is an experimental mode that adheres to the rendered layout of the PDF.
89
        :param plain_mode_orientations:
90
            Tuple of orientations to look for when extracting text from a PDF in plain mode.
91
            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
92
        :param plain_mode_space_width:
93
            Forces default space width if not extracted from font.
94
            Ignored if `extraction_mode` is `PyPDFExtractionMode.LAYOUT`.
95
        :param layout_mode_space_vertically:
96
            Whether to include blank lines inferred from y distance + font height.
97
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
98
        :param layout_mode_scale_weight:
99
            Multiplier for string length when calculating weighted average character width.
100
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
101
        :param layout_mode_strip_rotated:
102
            Layout mode does not support rotated text. Set to `False` to include rotated text anyway.
103
            If rotated text is discovered, layout will be degraded and a warning will be logged.
104
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
105
        :param layout_mode_font_height_weight:
106
            Multiplier for font height when calculating blank line height.
107
            Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
108
        :param store_full_path:
109
            If True, the full path of the file is stored in the metadata of the document.
110
            If False, only the file name is stored.
111
        """
112
        pypdf_import.check()
1✔
113

114
        self.store_full_path = store_full_path
1✔
115

116
        if isinstance(extraction_mode, str):
1✔
117
            extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
1✔
118
        self.extraction_mode = extraction_mode
1✔
119
        self.plain_mode_orientations = plain_mode_orientations
1✔
120
        self.plain_mode_space_width = plain_mode_space_width
1✔
121
        self.layout_mode_space_vertically = layout_mode_space_vertically
1✔
122
        self.layout_mode_scale_weight = layout_mode_scale_weight
1✔
123
        self.layout_mode_strip_rotated = layout_mode_strip_rotated
1✔
124
        self.layout_mode_font_height_weight = layout_mode_font_height_weight
1✔
125

126
    def to_dict(self):
1✔
127
        """
128
        Serializes the component to a dictionary.
129

130
        :returns:
131
            Dictionary with serialized data.
132
        """
133
        return default_to_dict(
1✔
134
            self,
135
            extraction_mode=str(self.extraction_mode),
136
            plain_mode_orientations=self.plain_mode_orientations,
137
            plain_mode_space_width=self.plain_mode_space_width,
138
            layout_mode_space_vertically=self.layout_mode_space_vertically,
139
            layout_mode_scale_weight=self.layout_mode_scale_weight,
140
            layout_mode_strip_rotated=self.layout_mode_strip_rotated,
141
            layout_mode_font_height_weight=self.layout_mode_font_height_weight,
142
            store_full_path=self.store_full_path,
143
        )
144

145
    @classmethod
1✔
146
    def from_dict(cls, data):
1✔
147
        """
148
        Deserializes the component from a dictionary.
149

150
        :param data:
151
            Dictionary with serialized data.
152

153
        :returns:
154
            Deserialized component.
155
        """
156
        return default_from_dict(cls, data)
1✔
157

158
    def _default_convert(self, reader: "PdfReader") -> Document:
1✔
159
        texts = []
1✔
160
        for page in reader.pages:
1✔
161
            texts.append(
1✔
162
                page.extract_text(
163
                    orientations=self.plain_mode_orientations,
164
                    extraction_mode=self.extraction_mode.value,
165
                    space_width=self.plain_mode_space_width,
166
                    layout_mode_space_vertically=self.layout_mode_space_vertically,
167
                    layout_mode_scale_weight=self.layout_mode_scale_weight,
168
                    layout_mode_strip_rotated=self.layout_mode_strip_rotated,
169
                    layout_mode_font_height_weight=self.layout_mode_font_height_weight,
170
                )
171
            )
172
        text = "\f".join(texts)
1✔
173
        return Document(content=text)
1✔
174

175
    @component.output_types(documents=List[Document])
1✔
176
    def run(
1✔
177
        self,
178
        sources: List[Union[str, Path, ByteStream]],
179
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
180
    ):
181
        """
182
        Converts PDF files to documents.
183

184
        :param sources:
185
            List of file paths or ByteStream objects to convert.
186
        :param meta:
187
            Optional metadata to attach to the documents.
188
            This value can be a list of dictionaries or a single dictionary.
189
            If it's a single dictionary, its content is added to the metadata of all produced documents.
190
            If it's a list, its length must match the number of sources, as they are zipped together.
191
            For ByteStream objects, their `meta` is added to the output documents.
192

193
        :returns:
194
            A dictionary with the following keys:
195
            - `documents`: A list of converted documents.
196
        """
197
        documents = []
1✔
198
        meta_list = normalize_metadata(meta, sources_count=len(sources))
1✔
199

200
        for source, metadata in zip(sources, meta_list):
1✔
201
            try:
1✔
202
                bytestream = get_bytestream_from_source(source)
1✔
203
            except Exception as e:
1✔
204
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
1✔
205
                continue
1✔
206
            try:
1✔
207
                pdf_reader = PdfReader(io.BytesIO(bytestream.data))
1✔
208
                document = self._default_convert(pdf_reader)
1✔
209
            except Exception as e:
×
210
                logger.warning(
×
211
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
212
                )
213
                continue
×
214

215
            if document.content is None or document.content.strip() == "":
1✔
216
                logger.warning(
1✔
217
                    "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
218
                    source=source,
219
                )
220

221
            merged_metadata = {**bytestream.meta, **metadata}
1✔
222

223
            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
1✔
224
                merged_metadata["file_path"] = os.path.basename(file_path)
1✔
225
            document.meta = merged_metadata
1✔
226
            documents.append(document)
1✔
227

228
        return {"documents": documents}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc