• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 15191258343

22 May 2025 03:56PM UTC coverage: 90.344% (-0.03%) from 90.37%
15191258343

Pull #9420

github

web-flow
Merge 8f67d0123 into 4a5e4d3e6
Pull Request #9420: typing: adding stub files for the SuperComponents in haystack

11153 of 12345 relevant lines covered (90.34%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.23
haystack/components/converters/multi_file_converter.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from enum import Enum
1✔
6
from pathlib import Path
1✔
7
from typing import TYPE_CHECKING, Any, Optional, Union
1✔
8

9
from haystack import Document, Pipeline, super_component
1✔
10
from haystack.components.converters import (
1✔
11
    CSVToDocument,
12
    DOCXToDocument,
13
    HTMLToDocument,
14
    JSONConverter,
15
    PPTXToDocument,
16
    PyPDFToDocument,
17
    TextFileToDocument,
18
    XLSXToDocument,
19
)
20
from haystack.components.joiners import DocumentJoiner
1✔
21
from haystack.components.routers import FileTypeRouter
1✔
22
from haystack.dataclasses import ByteStream
1✔
23

24

25
class ConverterMimeType(str, Enum):
    """
    MIME types of the file formats that can be converted.

    Every member is also a plain ``str``, so a member compares equal to its MIME string.
    The declaration order is significant for code that iterates over the enum.
    """

    # Delimited and structured text formats
    CSV = "text/csv"
    # Office Open XML word-processing document
    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    HTML = "text/html"
    JSON = "application/json"
    # Markdown and plain text
    MD = "text/markdown"
    TEXT = "text/plain"
    PDF = "application/pdf"
    # Office Open XML presentation and spreadsheet
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
35

36

37
@super_component
class MultiFileConverter:
    """
    A file converter that handles conversion of multiple file types.

    The MultiFileConverter handles the following file types:
    - CSV
    - DOCX
    - HTML
    - JSON
    - MD
    - TEXT
    - PDF (no OCR)
    - PPTX
    - XLSX

    Internally a router dispatches every source to the converter registered for its MIME type and a joiner
    collects the converted documents. Sources the router cannot classify are returned under ``unclassified``.

    Usage example:
    ```
    from haystack.components.converters.multi_file_converter import MultiFileConverter

    converter = MultiFileConverter()
    converter.run(sources=["test.txt", "test.pdf"], meta={})
    ```
    """

    def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
        """
        Initialize the MultiFileConverter.

        :param encoding: The encoding to use when reading files. It is passed to the CSV, Markdown and
            plain-text converters.
        :param json_content_key: The key to use in a content field in a document when converting JSON files.
        """
        self.encoding = encoding
        self.json_content_key = json_content_key

        # initialize components
        router = FileTypeRouter(
            mime_types=[mime_type.value for mime_type in ConverterMimeType],
            # Ensure common extensions are registered. Tests on Windows fail otherwise.
            additional_mimetypes={
                ConverterMimeType.DOCX.value: ".docx",
                ConverterMimeType.XLSX.value: ".xlsx",
                ConverterMimeType.PPTX.value: ".pptx",
            },
        )

        # Create pipeline and add components
        pp = Pipeline()

        pp.add_component("router", router)
        pp.add_component("docx", DOCXToDocument(link_format="markdown"))
        pp.add_component(
            "html",
            HTMLToDocument(
                extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
            ),
        )
        pp.add_component("json", JSONConverter(content_key=self.json_content_key))
        pp.add_component("md", TextFileToDocument(encoding=self.encoding))
        pp.add_component("text", TextFileToDocument(encoding=self.encoding))
        pp.add_component("pdf", PyPDFToDocument())
        pp.add_component("pptx", PPTXToDocument())
        pp.add_component("xlsx", XLSXToDocument())
        pp.add_component("joiner", DocumentJoiner())
        pp.add_component("csv", CSVToDocument(encoding=self.encoding))

        # Each converter is registered under the lower-cased name of its enum member, and the router
        # output is named after the MIME type. Use `.name` directly instead of parsing `str(member)`,
        # whose format depends on the enum's `__str__` implementation.
        for mime_type in ConverterMimeType:
            pp.connect(f"router.{mime_type.value}", mime_type.name.lower())

        # Feed every converter's documents into the joiner (same connection order as before).
        for converter_name in ("docx", "html", "json", "md", "text", "pdf", "pptx", "csv", "xlsx"):
            pp.connect(f"{converter_name}.documents", "joiner.documents")

        self.pipeline = pp
        self.output_mapping = {"joiner.documents": "documents", "router.unclassified": "unclassified"}
        self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}

    if TYPE_CHECKING:
        # fake method, never executed, but static analyzers will not complain about missing method
        # `sources` is keyword-only so the stub matches the `input_mapping` key and the documented
        # call `run(sources=[...], meta=...)`.
        def run(  # noqa: D102
            self,
            *,
            sources: list[Union[str, Path, ByteStream]],
            meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
        ) -> dict[str, list[Document]]:  # noqa: D102
            ...
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc