12298066419

Committed 12 Dec 2024 02:07PM UTC coverage: 90.408% (+0.06%) from 90.346%

Build # 12298066419

Build Type

Pull #8522

github

Committed by

web-flow

Commit Message

Merge 669550d36 into 04fc187bc

Pull Request Pull Request #8522: feat: Add XLSXToDocument converter

Run Details

8096 of 8955 relevant lines covered (90.41%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.18

haystack/components/converters/txt.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream

logger = logging.getLogger(__name__)


@component
class TextFileToDocument:
    """
    Converts text files to documents your pipeline can query.

    By default, it uses UTF-8 encoding when converting files but
    you can also set custom encoding.
    It can attach metadata to the resulting documents.

    Sources that cannot be read or decoded are skipped with a warning instead of
    raising, so the number of returned documents can be smaller than the number
    of sources.

    ### Usage example

    ```python
    from haystack.components.converters.txt import TextFileToDocument

    converter = TextFileToDocument()
    results = converter.run(sources=["sample.txt"])
    documents = results["documents"]
    print(documents[0].content)
    # 'This is the content from the txt file.'
    ```
    """

    def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
        """
        Creates a TextFileToDocument component.

        :param encoding:
            The encoding of the text files to convert.
            If the encoding is specified in the metadata of a source ByteStream,
            it overrides this value.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        self.encoding = encoding
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts text files to documents.

        :param sources:
            List of text file paths or ByteStream objects to convert.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, its length must match the number of sources as they're zipped together.
            For ByteStream objects, their `meta` is added to the output documents.
            When a key is present in both, the value passed in `meta` takes precedence over the
            ByteStream's own `meta`.

        :returns:
            A dictionary with the following keys:
            - `documents`: A list of converted documents. Sources that could not be read or
              decoded are logged and left out of this list.
        """
        documents = []

        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            # Best-effort conversion: a source that cannot be loaded is logged and skipped
            # so that one bad input does not abort the whole batch.
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue
            try:
                # An encoding stored in the ByteStream's meta wins over the component default.
                encoding = bytestream.meta.get("encoding", self.encoding)
                text = bytestream.data.decode(encoding)
            except Exception as e:
                logger.warning(
                    "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
                )
                continue

            # Later entries win: caller-supplied metadata overrides the ByteStream's meta.
            merged_metadata = {**bytestream.meta, **metadata}

            # Unless the full path was requested, keep only the file name taken from the
            # ByteStream's `file_path` entry (when it has a non-empty one).
            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                merged_metadata["file_path"] = os.path.basename(file_path)
            document = Document(content=text, meta=merged_metadata)
            documents.append(document)

        return {"documents": documents}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import os	1✔
6	from pathlib import Path	1✔
7	from typing import Any, Dict, List, Optional, Union	1✔
8
9	from haystack import Document, component, logging	1✔
10	from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata	1✔
11	from haystack.dataclasses import ByteStream	1✔
12
13	logger = logging.getLogger(__name__)	1✔
14
15
16	@component	1✔
17	class TextFileToDocument:	1✔
18	"""
19	Converts text files to documents your pipeline can query.
20
21	By default, it uses UTF-8 encoding when converting files but
22	you can also set custom encoding.
23	It can attach metadata to the resulting documents.
24
25	### Usage example
26
27	```python
28	from haystack.components.converters.txt import TextFileToDocument
29
30	converter = TextFileToDocument()
31	results = converter.run(sources=["sample.txt"])
32	documents = results["documents"]
33	print(documents[0].content)
34	# 'This is the content from the txt file.'
35	```
36	"""
37
38	def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):	1✔
39	"""
40	Creates a TextFileToDocument component.
41
42	:param encoding:
43	The encoding of the text files to convert.
44	If the encoding is specified in the metadata of a source ByteStream,
45	it overrides this value.
46	:param store_full_path:
47	If True, the full path of the file is stored in the metadata of the document.
48	If False, only the file name is stored.
49	"""
50	self.encoding = encoding	1✔
51	self.store_full_path = store_full_path	1✔
52
53	@component.output_types(documents=List[Document])	1✔
54	def run(	1✔
55	self,
56	sources: List[Union[str, Path, ByteStream]],
57	meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
58	):
59	"""
60	Converts text files to documents.
61
62	:param sources:
63	List of HTML file paths or ByteStream objects to convert.
64	:param meta:
65	Optional metadata to attach to the documents.
66	This value can be a list of dictionaries or a single dictionary.
67	If it's a single dictionary, its content is added to the metadata of all produced documents.
68	If it's a list, its length must match the number of sources as they're zipped together.
69	For ByteStream objects, their `meta` is added to the output documents.
70
71	:returns:
72	A dictionary with the following keys:
73	- `documents`: A list of converted documents.
74	"""
75	documents = []	1✔
76
77	meta_list = normalize_metadata(meta, sources_count=len(sources))	1✔
78
79	for source, metadata in zip(sources, meta_list):	1✔
80	try:	1✔
81	bytestream = get_bytestream_from_source(source)	1✔
82	except Exception as e:	1✔
83	logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)	1✔
84	continue	1✔
85	try:	1✔
86	encoding = bytestream.meta.get("encoding", self.encoding)	1✔
87	text = bytestream.data.decode(encoding)	1✔
88	except Exception as e:	×
89	logger.warning(	×
90	"Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
91	)
92	continue	×
93
94	merged_metadata = {**bytestream.meta, **metadata}	1✔
95
96	if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):	1✔
97	merged_metadata["file_path"] = os.path.basename(file_path)	1✔
98	document = Document(content=text, meta=merged_metadata)	1✔
99	documents.append(document)	1✔
100
101	return {"documents": documents}	1✔

deepset-ai / haystack / 12298066419

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous