• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 16937965780

13 Aug 2025 12:59PM UTC coverage: 92.122% (-0.06%) from 92.184%
16937965780

Pull #9710

github

web-flow
Merge e0256ac7f into c7256b211
Pull Request #9710: !fix: `FileTypeRouter` raising `FileNotFound` in a consistently

12887 of 13989 relevant lines covered (92.12%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.48
haystack/components/routers/file_type_router.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import mimetypes
1✔
6
import re
1✔
7
import warnings
1✔
8
from collections import defaultdict
1✔
9
from pathlib import Path
1✔
10
from typing import Any, Optional, Union
1✔
11

12
from haystack import component, default_from_dict, default_to_dict
1✔
13
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
14
from haystack.dataclasses import ByteStream
1✔
15

16
from haystack.utils.misc import _guess_mime_type  # ruff: isort: skip
1✔
17

18
# We import CUSTOM_MIMETYPES here to prevent breaking change from moving to haystack.utils.misc
19
from haystack.utils.misc import CUSTOM_MIMETYPES  # pylint: disable=unused-import
1✔
20

21

22
@component
class FileTypeRouter:
    """
    Categorizes files or byte streams by their MIME types, helping in context-based routing.

    FileTypeRouter supports both exact MIME type matching and regex patterns.

    For file paths, MIME types come from extensions, while byte streams use their `mime_type` attribute.
    You can use regex patterns in the `mime_types` parameter to set broad categories
    (such as `audio/.*` or `text/.*`) or specific types.

    Every entry of `mime_types` is compiled as a regular expression and has to match the whole MIME type,
    so entries without regex syntax behave as exact matches. MIME types that contain regex metacharacters
    (for example the `+` in `image/svg+xml`) must be escaped (`image/svg\\+xml`) to be matched literally.

    ### Usage example

    ```python
    from haystack.components.routers import FileTypeRouter
    from pathlib import Path

    # For exact MIME type matching
    router = FileTypeRouter(mime_types=["text/plain", "application/pdf"])

    # For flexible matching using regex, to handle all audio types
    router_with_regex = FileTypeRouter(mime_types=[r"audio/.*", r"text/plain"])

    sources = [Path("file.txt"), Path("document.pdf"), Path("song.mp3")]
    print(router.run(sources=sources))
    print(router_with_regex.run(sources=sources))

    # Expected output:
    # {'text/plain': [
    #   PosixPath('file.txt')], 'application/pdf': [PosixPath('document.pdf')], 'unclassified': [PosixPath('song.mp3')
    # ]}
    # {'audio/.*': [
    #   PosixPath('song.mp3')], 'text/plain': [PosixPath('file.txt')], 'unclassified': [PosixPath('document.pdf')
    # ]}
    ```
    """

    def __init__(
        self,
        mime_types: list[str],
        additional_mimetypes: Optional[dict[str, str]] = None,
        raise_on_failure: bool = False,  # Set to True in 2.18 release and remove warning below
    ):
        """
        Initialize the FileTypeRouter component.

        :param mime_types:
            A list of MIME types or regex patterns to classify the input files or byte streams.
            (for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).

        :param additional_mimetypes:
            A dictionary containing the MIME type to add to the mimetypes package to prevent unsupported or non-native
            packages from being unclassified.
            (for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`).

        :param raise_on_failure:
            If True, raises FileNotFoundError when a file path doesn't exist, regardless of whether metadata is
            provided.
            If False (default), only raises FileNotFoundError when metadata is provided (current behavior).
            This parameter will be removed in a future release where consistent behavior will be enforced.

        :raises ValueError:
            If `mime_types` is empty or one of its entries is not a valid regular expression.
        """
        if not mime_types:
            raise ValueError("The list of mime types cannot be empty.")

        # Deprecation warning for inconsistent FileNotFoundError behavior.
        # Callers who already opted into the future behavior have nothing left to act on,
        # so the warning is only emitted for the legacy (raise_on_failure=False) mode.
        if not raise_on_failure:
            warnings.warn(
                "FileTypeRouter currently has inconsistent behavior: FileNotFoundError is only raised when "
                "metadata is provided. "
                "This will be changed in a future release to always raise FileNotFoundError for non-existent files. "
                "Set raise_on_failure=True to opt into the future behavior.",
                DeprecationWarning,
                stacklevel=2,
            )

        # Registering extra types mutates the process-wide `mimetypes` registry.
        if additional_mimetypes:
            for mime, ext in additional_mimetypes.items():
                mimetypes.add_type(mime, ext)

        # Compile every pattern once up front so `run` only has to match.
        self.mime_type_patterns = []
        for mime_type in mime_types:
            try:
                pattern = re.compile(mime_type)
            except re.error as err:
                # Keep the original regex error as the cause to make the bad pattern easier to debug.
                raise ValueError(f"Invalid regex pattern '{mime_type}'.") from err
            self.mime_type_patterns.append(pattern)

        # the actual output type is list[Union[Path, ByteStream]],
        # but this would cause PipelineConnectError with Converters
        component.set_output_types(
            self,
            unclassified=list[Union[str, Path, ByteStream]],
            **dict.fromkeys(mime_types, list[Union[str, Path, ByteStream]]),
        )
        self.mime_types = mime_types
        self._additional_mimetypes = additional_mimetypes
        self._raise_on_failure = raise_on_failure

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            mime_types=self.mime_types,
            additional_mimetypes=self._additional_mimetypes,
            raise_on_failure=self._raise_on_failure,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FileTypeRouter":
        """
        Deserializes the component from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        """
        return default_from_dict(cls, data)

    def run(
        self,
        sources: list[Union[str, Path, ByteStream]],
        meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
    ) -> dict[str, list[Union[ByteStream, Path]]]:
        """
        Categorize files or byte streams according to their MIME types.

        :param sources:
            A list of file paths or byte streams to categorize.

        :param meta:
            Optional metadata to attach to the sources.
            When provided, the sources are internally converted to ByteStream objects and the metadata is added.
            This value can be a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all ByteStream objects.
            If it's a list, its length must match the number of sources, as they are zipped together.

        :returns: A dictionary where the keys are MIME types (or `"unclassified"`) and the values are lists of data
            sources.

        :raises FileNotFoundError:
            If `raise_on_failure` is True and a file path in `sources` does not exist.
        :raises ValueError:
            If a source is neither a `str`, a `Path` nor a `ByteStream`.
        """

        mime_types = defaultdict(list)
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, meta_dict in zip(sources, meta_list):
            # String sources are treated as file paths.
            if isinstance(source, str):
                source = Path(source)

            if isinstance(source, Path):
                # Check if file exists when raise_on_failure is True
                if self._raise_on_failure and not source.exists():
                    raise FileNotFoundError(f"File not found: {source}")

                mime_type = _guess_mime_type(source)
            elif isinstance(source, ByteStream):
                mime_type = source.mime_type
            else:
                raise ValueError(f"Unsupported data source type: {type(source).__name__}")

            # If we have metadata, we convert the source to ByteStream and add the metadata
            if meta_dict:
                source = get_bytestream_from_source(source)
                source.meta.update(meta_dict)

            # The first pattern that matches the whole MIME type wins; the output key is the
            # pattern text exactly as it was passed to `__init__`.
            matched = False
            if mime_type:
                for pattern in self.mime_type_patterns:
                    if pattern.fullmatch(mime_type):
                        mime_types[pattern.pattern].append(source)
                        matched = True
                        break
            if not matched:
                mime_types["unclassified"].append(source)

        return dict(mime_types)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc