• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 13972131258

20 Mar 2025 02:43PM UTC coverage: 90.021% (-0.03%) from 90.054%
13972131258

Pull #9069

github

web-flow
Merge 8371761b0 into 67ab3788e
Pull Request #9069: refactor!: `ChatMessage` serialization-deserialization updates

9833 of 10923 relevant lines covered (90.02%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.33
haystack/components/routers/file_type_router.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import mimetypes
1✔
6
import re
1✔
7
from collections import defaultdict
1✔
8
from pathlib import Path
1✔
9
from typing import Any, Dict, List, Optional, Union
1✔
10

11
from haystack import component, default_from_dict, default_to_dict
1✔
12
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1✔
13
from haystack.dataclasses import ByteStream
1✔
14

15
# File-extension -> MIME type mappings consulted by the router in addition to
# what the standard `mimetypes` module can guess. Keys are lower-case
# extensions including the leading dot.
CUSTOM_MIMETYPES: Dict[str, str] = {
    # Markdown is not added by the mimetypes module,
    # see https://github.com/python/cpython/pull/17995
    ".md": "text/markdown",
    ".markdown": "text/markdown",
    # Outlook message files are not added by the mimetypes module either
    ".msg": "application/vnd.ms-outlook",
}
23

24

25
@component
class FileTypeRouter:
    """
    Categorizes files or byte streams by their MIME types, helping in context-based routing.

    FileTypeRouter supports both exact MIME type matching and regex patterns.

    For file paths, MIME types come from extensions, while byte streams use their `mime_type` attribute.
    You can use regex patterns in the `mime_types` parameter to set broad categories
    (such as `audio/.*` or `text/.*`) or specific types. Patterns must match the whole MIME type
    and are tried in the order they were given; the first one that matches wins.
    A MIME type that is literally equal to one of the configured entries always matches that entry,
    even if it contains regex metacharacters (for example `image/svg+xml`).

    ### Usage example

    ```python
    from haystack.components.routers import FileTypeRouter
    from pathlib import Path

    # For exact MIME type matching
    router = FileTypeRouter(mime_types=["text/plain", "application/pdf"])

    # For flexible matching using regex, to handle all audio types
    router_with_regex = FileTypeRouter(mime_types=[r"audio/.*", r"text/plain"])

    sources = [Path("file.txt"), Path("document.pdf"), Path("song.mp3")]
    print(router.run(sources=sources))
    print(router_with_regex.run(sources=sources))

    # Expected output:
    # {'text/plain': [PosixPath('file.txt')],
    #  'application/pdf': [PosixPath('document.pdf')],
    #  'unclassified': [PosixPath('song.mp3')]}
    # {'audio/.*': [PosixPath('song.mp3')],
    #  'text/plain': [PosixPath('file.txt')],
    #  'unclassified': [PosixPath('document.pdf')]}
    ```
    """

    def __init__(self, mime_types: List[str], additional_mimetypes: Optional[Dict[str, str]] = None):
        """
        Initialize the FileTypeRouter component.

        :param mime_types:
            A list of MIME types or regex patterns to classify the input files or byte streams.
            (for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).

        :param additional_mimetypes:
            A dictionary mapping MIME types to file extensions. Each pair is registered through
            `mimetypes.add_type` so that files with extensions unknown to the `mimetypes` module
            do not end up unclassified.
            (for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`).

        :raises ValueError:
            If `mime_types` is empty or one of its entries is not a valid regex pattern.
        """
        if not mime_types:
            raise ValueError("The list of mime types cannot be empty.")

        if additional_mimetypes:
            for mime, ext in additional_mimetypes.items():
                mimetypes.add_type(mime, ext)

        # Compile every entry once up front so `run` does not recompile per source
        # and invalid patterns are rejected at construction time.
        self.mime_type_patterns = []
        for mime_type in mime_types:
            try:
                pattern = re.compile(mime_type)
            except re.error as err:
                # Chain the original error so the regex diagnostics are not lost.
                raise ValueError(f"Invalid regex pattern '{mime_type}'.") from err
            self.mime_type_patterns.append(pattern)

        # the actual output type is List[Union[Path, ByteStream]],
        # but this would cause PipelineConnectError with Converters
        component.set_output_types(
            self,
            unclassified=List[Union[str, Path, ByteStream]],
            **dict.fromkeys(mime_types, List[Union[str, Path, ByteStream]]),
        )
        # Kept as given so that `to_dict` can serialize the init parameters.
        self.mime_types = mime_types
        self._additional_mimetypes = additional_mimetypes

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(self, mime_types=self.mime_types, additional_mimetypes=self._additional_mimetypes)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "FileTypeRouter":
        """
        Deserializes the component from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        """
        return default_from_dict(cls, data)

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Union[ByteStream, Path]]]:
        """
        Categorize files or byte streams according to their MIME types.

        :param sources:
            A list of file paths or byte streams to categorize.

        :param meta:
            Optional metadata to attach to the sources.
            When provided, the sources are internally converted to ByteStream objects and the metadata is added.
            This value can be a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all ByteStream objects.
            If it's a list, its length must match the number of sources, as they are zipped together.

        :returns: A dictionary where the keys are the configured MIME types or patterns (or `"unclassified"`)
            and the values are lists of data sources. Only keys that received at least one source are present.

        :raises ValueError:
            If a source is neither a string, a `Path`, nor a `ByteStream`.
        """

        mime_types = defaultdict(list)
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, meta_dict in zip(sources, meta_list):
            if isinstance(source, str):
                source = Path(source)

            if isinstance(source, Path):
                mime_type = self._get_mime_type(source)
            elif isinstance(source, ByteStream):
                mime_type = source.mime_type
            else:
                raise ValueError(f"Unsupported data source type: {type(source).__name__}")

            # If we have metadata, we convert the source to ByteStream and add the metadata
            if meta_dict:
                source = get_bytestream_from_source(source)
                source.meta.update(meta_dict)

            category = "unclassified"
            if mime_type:
                for pattern in self.mime_type_patterns:
                    # A literal match is checked first: MIME types such as "image/svg+xml" contain
                    # regex metacharacters, so the compiled pattern would not match the type itself.
                    if mime_type == pattern.pattern or pattern.fullmatch(mime_type):
                        category = pattern.pattern
                        break
            mime_types[category].append(source)

        return dict(mime_types)

    def _get_mime_type(self, path: Path) -> Optional[str]:
        """
        Get the MIME type of the provided file path.

        :param path: The file path to get the MIME type for.

        :returns: The MIME type of the provided file path, or `None` if the MIME type cannot be determined.
        """
        extension = path.suffix.lower()
        mime_type = mimetypes.guess_type(path.as_posix())[0]
        # An entry in CUSTOM_MIMETYPES takes precedence for its extension;
        # the type guessed by `mimetypes` is only the fallback.
        return CUSTOM_MIMETYPES.get(extension, mime_type)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc