• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 5797828511

pending completion
5797828511

push

github

web-flow
feat: Improve performance and add default media support in FileTypeClassifier (#5083)

* feat: add media outgoing edge to FileTypeClassifier

* Add release note

* Update language

---------

Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Co-authored-by: agnieszka-m <amarzec13@gmail.com>

10912 of 23244 relevant lines covered (46.95%)

2.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.1
haystack/nodes/file_classifier/file_type.py
1
import mimetypes
11✔
2
from typing import Any, Dict, List, Union, Optional
11✔
3

4
import logging
11✔
5
from pathlib import Path
11✔
6

7
from haystack.nodes.base import BaseComponent
11✔
8
from haystack.lazy_imports import LazyImport
11✔
9

10

11
logger = logging.getLogger(__name__)
11✔
12

13
with LazyImport() as magic_import:
11✔
14
    import magic
11✔
15

16

17
# Types handled when no `supported_types` is passed to FileTypeClassifier.
# The 1-based position of a type in the list selects the `output_<n>` edge used by `run()`.
# "media" is not a file extension: it is the bucket returned for files detected as one of DEFAULT_MEDIA_TYPES.
DEFAULT_TYPES = ["txt", "pdf", "md", "docx", "html", "media"]
11✔
18

19
# Extensions that are reported as "media" when the classifier runs with the default types.
# Files carrying one of these suffixes are also re-checked by content instead of trusting the suffix.
DEFAULT_MEDIA_TYPES = ["mp3", "mp4", "mpeg", "m4a", "wav", "webm"]
11✔
20

21

22
class FileTypeClassifier(BaseComponent):
    """
    Route files in an Indexing Pipeline to corresponding file converters.

    Every supported type is bound to one outgoing edge, following the order of `supported_types`:
    the first type is sent to `output_1`, the second to `output_2`, and so on.
    """

    outgoing_edges = len(DEFAULT_TYPES)

    def __init__(self, supported_types: Optional[List[str]] = None, full_analysis: bool = False):
        """
        Node that sends out files on a different output edge depending on their extension.

        :param supported_types: The file types this node distinguishes. Optional.
            If you don't provide any value, the default is: `txt`, `pdf`, `md`, `docx`, `html`, and `media`.
            With the default types, files whose content is detected as `mp3`, `mp4`, `mpeg`, `m4a`, `wav`,
            or `webm` are reported as `media`.
            You can't use lists with duplicate elements.
        :param full_analysis: If True, the whole file is analyzed to determine the file type.
            If False, only the first 2049 bytes are analyzed.
        :raises ValueError: If `supported_types` contains duplicate values.
        """
        self.full_analysis = full_analysis
        self._default_types = supported_types is None
        if supported_types is None:
            # Copy so that changes to this instance's list never leak into the module-level default.
            supported_types = list(DEFAULT_TYPES)
        if len(set(supported_types)) != len(supported_types):
            # Work on a copy: removing one occurrence of each distinct value leaves only the
            # surplus (duplicated) entries, without mutating the list the caller passed in.
            duplicates = list(supported_types)
            for item in set(supported_types):
                duplicates.remove(item)
            raise ValueError(f"supported_types can't contain duplicate values ({duplicates}).")

        super().__init__()

        self.supported_types = supported_types

    @classmethod
    def _calculate_outgoing_edges(cls, component_params: Dict[str, Any]) -> int:
        """
        Return the number of outgoing edges for the given component parameters.

        :param component_params: the parameters the component is created with
        :return: the number of supported types, falling back to the default types when
            `supported_types` is missing or explicitly set to None
        """
        supported_types = component_params.get("supported_types")
        if supported_types is None:
            # `__init__` accepts None as "use the defaults", so mirror that here.
            supported_types = DEFAULT_TYPES
        return len(supported_types)

    def _estimate_extension(self, file_path: Path) -> str:
        """
        Return the extension found based on the contents of the given file

        :param file_path: the path to extract the extension from
        :return: the guessed extension without the leading dot, `media` if the default types are in use
            and the guessed extension is a default media type, or an empty string if nothing could be guessed
        """
        try:
            with open(file_path, "rb") as f:
                buffer = f.read() if self.full_analysis else f.read(2049)
            mime_type = magic.from_buffer(buffer, mime=True)
        except (NameError, ImportError):
            logger.error(
                "The type of '%s' could not be guessed, probably because 'python-magic' is not installed. Ignoring this error. "
                "Please make sure the necessary OS libraries are installed if you need this functionality ('python-magic' or 'python-magic-bin' on Windows).",
                file_path,
            )
            return ""

        # `guess_extension` returns None for unknown MIME types; normalize to "" before stripping the dot.
        real_extension = (mimetypes.guess_extension(mime_type) or "").lstrip(".")
        if self._default_types and real_extension in DEFAULT_MEDIA_TYPES:
            return "media"
        return real_extension

    def _resolve_extension(self, path: Path) -> str:
        """
        Return the type of a single file, looking at its content only when the suffix can't be trusted.

        :param path: the path to extract the extension from
        :return: the lowercase suffix without the leading dot, or the content-based estimate when the
            file has no suffix or (with the default types) carries a default media suffix
        """
        suffix = path.suffix.lower().lstrip(".")
        if suffix == "" or (self._default_types and suffix in DEFAULT_MEDIA_TYPES):
            return self._estimate_extension(path)
        return suffix

    def _get_extension(self, file_paths: List[Path]) -> str:
        """
        Return the extension found in the given list of files.
        Also makes sure that all files have the same extension.
        If this is not true, it throws an exception.

        :param file_paths: the paths to extract the extension from
        :return: the extension shared by all files, the extension will be guessed if the file has none
        :raises ValueError: If the files don't all resolve to the same extension.
        :raises IndexError: If `file_paths` is empty.
        """
        extension = self._resolve_extension(file_paths[0])

        # The first file is already resolved above; checking it again would only repeat
        # the file read and content analysis.
        for path in file_paths[1:]:
            if self._resolve_extension(path) != extension:
                raise ValueError("Multiple non-default file types are not allowed at once.")

        return extension

    def run(self, file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]]):  # type: ignore
        """
        Sends out files on a different output edge depending on their extension.

        :param file_paths: paths to route on different edges.
        :return: a tuple with the output dictionary (the paths under `file_paths`) and the name of the
            outgoing edge (`output_<n>`, where n is the 1-based position of the type in `supported_types`)
        :raises ValueError: If the files have different types or their type is not supported.
        """
        if not isinstance(file_paths, list):
            file_paths = [file_paths]

        paths = [Path(path) for path in file_paths]

        output = {"file_paths": paths}
        extension = self._get_extension(paths)
        try:
            index = self.supported_types.index(extension) + 1
        except ValueError:
            # `from None` hides the uninformative "x is not in list" error raised by `index()`.
            raise ValueError(
                f"Files of type '{extension}' ({paths[0]}) are not supported. "
                f"The supported types are: {self.supported_types}. "
                "Consider using the 'supported_types' parameter to "
                "change the types accepted by this node."
            ) from None
        return output, f"output_{index}"

    def run_batch(self, file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]]):  # type: ignore
        """
        Batch entry point; routes the files exactly like `run()`.

        :param file_paths: paths to route on different edges.
        """
        return self.run(file_paths=file_paths)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc