• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 12298066419

12 Dec 2024 02:07PM UTC coverage: 90.408% (+0.06%) from 90.346%
12298066419

Pull #8522

github

web-flow
Merge 669550d36 into 04fc187bc
Pull Request #8522: feat: Add XLSXToDocument converter

8096 of 8955 relevant lines covered (90.41%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.18
haystack/components/converters/txt.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import os
1✔
6
from pathlib import Path
1✔
7
from typing import Any, Dict, List, Optional, Union
1✔
8

9
from haystack import Document, component, logging
1✔
10
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
11
from haystack.dataclasses import ByteStream
1✔
12

13
logger = logging.getLogger(__name__)
1✔
14

15

16
@component
1✔
17
class TextFileToDocument:
1✔
18
    """
19
    Converts text files to documents your pipeline can query.
20

21
    By default, it uses UTF-8 encoding when converting files but
22
    you can also set a custom encoding.
23
    It can attach metadata to the resulting documents.
24

25
    ### Usage example
26

27
    ```python
28
    from haystack.components.converters.txt import TextFileToDocument
29

30
    converter = TextFileToDocument()
31
    results = converter.run(sources=["sample.txt"])
32
    documents = results["documents"]
33
    print(documents[0].content)
34
    # 'This is the content from the txt file.'
35
    ```
36
    """
37

38
    def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
1✔
39
        """
40
        Creates a TextFileToDocument component.
41

42
        :param encoding:
43
            The encoding of the text files to convert.
44
            If the encoding is specified in the metadata of a source ByteStream,
45
            it overrides this value.
46
        :param store_full_path:
47
            If True, the full path of the file is stored in the metadata of the document.
48
            If False, only the file name is stored.
49
        """
50
        self.encoding = encoding
1✔
51
        self.store_full_path = store_full_path
1✔
52

53
    @component.output_types(documents=List[Document])
1✔
54
    def run(
1✔
55
        self,
56
        sources: List[Union[str, Path, ByteStream]],
57
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
58
    ):
59
        """
60
        Converts text files to documents.
61

62
        :param sources:
63
            List of text file paths or ByteStream objects to convert.
64
        :param meta:
65
            Optional metadata to attach to the documents.
66
            This value can be a list of dictionaries or a single dictionary.
67
            If it's a single dictionary, its content is added to the metadata of all produced documents.
68
            If it's a list, its length must match the number of sources as they're zipped together.
69
            For ByteStream objects, their `meta` is added to the output documents.
70

71
        :returns:
72
            A dictionary with the following keys:
73
            - `documents`: A list of converted documents.
74
        """
75
        documents = []
1✔
76

77
        meta_list = normalize_metadata(meta, sources_count=len(sources))
1✔
78

79
        for source, metadata in zip(sources, meta_list):
1✔
80
            try:
1✔
81
                bytestream = get_bytestream_from_source(source)
1✔
82
            except Exception as e:
1✔
83
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
1✔
84
                continue
1✔
85
            try:
1✔
86
                encoding = bytestream.meta.get("encoding", self.encoding)
1✔
87
                text = bytestream.data.decode(encoding)
1✔
88
            except Exception as e:
×
89
                logger.warning(
×
90
                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
91
                )
92
                continue
×
93

94
            merged_metadata = {**bytestream.meta, **metadata}
1✔
95

96
            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
1✔
97
                merged_metadata["file_path"] = os.path.basename(file_path)
1✔
98
            document = Document(content=text, meta=merged_metadata)
1✔
99
            documents.append(document)
1✔
100

101
        return {"documents": documents}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc