15191258343

Committed 22 May 2025 03:56PM UTC coverage: 90.344% (-0.03%) from 90.37%

Build # 15191258343

Build Type

Pull #9420

github

Committed by

web-flow

Commit Message

Merge 8f67d0123 into 4a5e4d3e6

Pull Request #9420: typing: adding stub files for the SuperComponents in haystack

Run Details

11153 of 12345 relevant lines covered (90.34%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.23

haystack/components/converters/multi_file_converter.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Union

from haystack import Document, Pipeline, super_component
from haystack.components.converters import (
    CSVToDocument,
    DOCXToDocument,
    HTMLToDocument,
    JSONConverter,
    PPTXToDocument,
    PyPDFToDocument,
    TextFileToDocument,
    XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.routers import FileTypeRouter
from haystack.dataclasses import ByteStream


class ConverterMimeType(str, Enum):
    """
    MIME types that the MultiFileConverter routes to a dedicated converter.

    Each member's value is the MIME type string handed to the `FileTypeRouter`. The lower-cased member name
    is also the name of the converter component registered in the pipeline (for example `CSV` -> "csv"),
    so renaming a member requires renaming the matching component.
    """

    # Plain-text and structured-text formats
    CSV = "text/csv"
    # Office Open XML word-processing document (.docx)
    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    HTML = "text/html"
    JSON = "application/json"
    MD = "text/markdown"
    TEXT = "text/plain"
    PDF = "application/pdf"
    # Office Open XML presentation (.pptx)
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    # Office Open XML spreadsheet (.xlsx)
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


@super_component
class MultiFileConverter:
    """
    A file converter that handles conversion of multiple file types.

    Internally this wraps a pipeline: a `FileTypeRouter` dispatches each source by MIME type to a dedicated
    converter, and a `DocumentJoiner` merges the documents produced by all converters. The joiner's
    documents are exposed as the `documents` output and the router's `unclassified` output is exposed as
    `unclassified`.

    The MultiFileConverter handles the following file types:
    - CSV
    - DOCX
    - HTML
    - JSON
    - MD
    - TEXT
    - PDF (no OCR)
    - PPTX
    - XLSX

    Usage example:
    ```
    from haystack.components.converters.multi_file_converter import MultiFileConverter

    converter = MultiFileConverter()
    converter.run(sources=["test.txt", "test.pdf"], meta={})
    ```
    """

    def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
        """
        Initialize the MultiFileConverter.

        :param encoding: The encoding to use when reading files. It is passed to the CSV, Markdown and
            plain-text converters.
        :param json_content_key: The key to use in a content field in a document when converting JSON files.
        """
        self.encoding = encoding
        self.json_content_key = json_content_key

        # initialize components
        router = FileTypeRouter(
            mime_types=[mime_type.value for mime_type in ConverterMimeType],
            # Ensure common extensions are registered. Tests on Windows fail otherwise.
            additional_mimetypes={
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            },
        )

        # Create pipeline and add components. Every converter is registered under the lower-cased name of
        # its ConverterMimeType member; the wiring loop below relies on that naming.
        pp = Pipeline()

        pp.add_component("router", router)
        pp.add_component("docx", DOCXToDocument(link_format="markdown"))
        pp.add_component(
            "html",
            HTMLToDocument(
                extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
            ),
        )
        pp.add_component("json", JSONConverter(content_key=self.json_content_key))
        pp.add_component("md", TextFileToDocument(encoding=self.encoding))
        pp.add_component("text", TextFileToDocument(encoding=self.encoding))
        pp.add_component("pdf", PyPDFToDocument())
        pp.add_component("pptx", PPTXToDocument())
        pp.add_component("xlsx", XLSXToDocument())
        pp.add_component("joiner", DocumentJoiner())
        pp.add_component("csv", CSVToDocument(encoding=self.encoding))

        # Connect each router output (named after the MIME type) to its converter. The member name is used
        # directly instead of parsing `str(mime_type)`, whose format depends on how the enum renders itself.
        for mime_type in ConverterMimeType:
            pp.connect(f"router.{mime_type.value}", mime_type.name.lower())

        # Feed every converter's documents into the joiner. The tuple keeps the original connection order.
        for converter_name in ("docx", "html", "json", "md", "text", "pdf", "pptx", "csv", "xlsx"):
            pp.connect(f"{converter_name}.documents", "joiner.documents")

        # Attributes consumed by the `super_component` decorator: the wrapped pipeline plus the mapping
        # between the wrapper's inputs/outputs and the inner component sockets.
        self.pipeline = pp
        self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}
        self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}

    if TYPE_CHECKING:
        # fake method, never executed, but static analyzers will not complain about missing method.
        # `sources` is keyword-only so that the documented call `run(sources=[...], meta=...)` type-checks;
        # a variadic `*sources` would reject the `sources=` keyword.
        def run(  # noqa: D102
            self,
            *,
            sources: list[Union[str, Path, ByteStream]],
            meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
        ) -> dict[str, list[Document]]:
            ...

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	from enum import Enum	1✔
6	from pathlib import Path	1✔
7	from typing import TYPE_CHECKING, Any, Optional, Union	1✔
8
9	from haystack import Document, Pipeline, super_component	1✔
10	from haystack.components.converters import (	1✔
11	CSVToDocument,
12	DOCXToDocument,
13	HTMLToDocument,
14	JSONConverter,
15	PPTXToDocument,
16	PyPDFToDocument,
17	TextFileToDocument,
18	XLSXToDocument,
19	)
20	from haystack.components.joiners import DocumentJoiner	1✔
21	from haystack.components.routers import FileTypeRouter	1✔
22	from haystack.dataclasses import ByteStream	1✔
23
24
25	class ConverterMimeType(str, Enum):	1✔
26	CSV = "text/csv"	1✔
27	DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"	1✔
28	HTML = "text/html"	1✔
29	JSON = "application/json"	1✔
30	MD = "text/markdown"	1✔
31	TEXT = "text/plain"	1✔
32	PDF = "application/pdf"	1✔
33	PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"	1✔
34	XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"	1✔
35
36
37	@super_component	1✔
38	class MultiFileConverter:	1✔
39	"""
40	A file converter that handles conversion of multiple file types.
41
42	The MultiFileConverter handles the following file types:
43	- CSV
44	- DOCX
45	- HTML
46	- JSON
47	- MD
48	- TEXT
49	- PDF (no OCR)
50	- PPTX
51	- XLSX
52
53	Usage example:
54	```
55	from haystack.super_components.converters import MultiFileConverter
56
57	converter = MultiFileConverter()
58	converter.run(sources=["test.txt", "test.pdf"], meta={})
59	```
60	"""
61
62	def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:	1✔
63	"""
64	Initialize the MultiFileConverter.
65
66	:param encoding: The encoding to use when reading files.
67	:param json_content_key: The key to use in a content field in a document when converting JSON files.
68	"""
69	self.encoding = encoding	1✔
70	self.json_content_key = json_content_key	1✔
71
72	# initialize components
73	router = FileTypeRouter(	1✔
74	mime_types=[mime_type.value for mime_type in ConverterMimeType],
75	# Ensure common extensions are registered. Tests on Windows fail otherwise.
76	additional_mimetypes={
77	"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
78	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
79	"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
80	},
81	)
82
83	# Create pipeline and add components
84	pp = Pipeline()	1✔
85
86	pp.add_component("router", router)	1✔
87	pp.add_component("docx", DOCXToDocument(link_format="markdown"))	1✔
88	pp.add_component(	1✔
89	"html",
90	HTMLToDocument(
91	extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
92	),
93	)
94	pp.add_component("json", JSONConverter(content_key=self.json_content_key))	1✔
95	pp.add_component("md", TextFileToDocument(encoding=self.encoding))	1✔
96	pp.add_component("text", TextFileToDocument(encoding=self.encoding))	1✔
97	pp.add_component("pdf", PyPDFToDocument())	1✔
98	pp.add_component("pptx", PPTXToDocument())	1✔
99	pp.add_component("xlsx", XLSXToDocument())	1✔
100	pp.add_component("joiner", DocumentJoiner())	1✔
101	pp.add_component("csv", CSVToDocument(encoding=self.encoding))	1✔
102
103	for mime_type in ConverterMimeType:	1✔
104	pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])	1✔
105
106	pp.connect("docx.documents", "joiner.documents")	1✔
107	pp.connect("html.documents", "joiner.documents")	1✔
108	pp.connect("json.documents", "joiner.documents")	1✔
109	pp.connect("md.documents", "joiner.documents")	1✔
110	pp.connect("text.documents", "joiner.documents")	1✔
111	pp.connect("pdf.documents", "joiner.documents")	1✔
112	pp.connect("pptx.documents", "joiner.documents")	1✔
113
114	pp.connect("csv.documents", "joiner.documents")	1✔
115	pp.connect("xlsx.documents", "joiner.documents")	1✔
116
117	self.pipeline = pp	1✔
118	self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}	1✔
119	self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}	1✔
120
121	if TYPE_CHECKING:	1✔
122	# fake method, never executed, but static analyzers will not complain about missing method
123	def run( # noqa: D102	×
124	self,
125	*sources: list[Union[str, Path, ByteStream]],
126	meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
127	) -> dict[str, list[Document]]: # noqa: D102
128	...	×

deepset-ai / haystack / 15191258343

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous