• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 14854285423

06 May 2025 07:42AM CUT coverage: 90.403% (-0.007%) from 90.41%
14854285423

Pull #9329

github

web-flow
Merge 45165840a into 64f384b52
Pull Request #9329: feat: add py.typed; adjust `Component` protocol

10908 of 12066 relevant lines covered (90.4%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.56
haystack/components/preprocessors/document_preprocessor.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from typing import Any, Callable, Dict, List, Literal, Optional
1✔
6

7
from haystack import Pipeline, default_from_dict, default_to_dict, super_component
1✔
8
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
1✔
9
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
1✔
10
from haystack.utils import deserialize_callable, serialize_callable
1✔
11

12

13
@super_component
class DocumentPreprocessor:
    """
    A SuperComponent that first splits and then cleans documents.

    Internally this wires a `DocumentSplitter` into a `DocumentCleaner` inside a single `Pipeline`:
    the splitter's `documents` output is connected to the cleaner's `documents` input.
    It takes a list of documents as input and returns a processed list of documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentPreprocessor

    doc = Document(content="I love pizza!")
    preprocessor = DocumentPreprocessor()
    result = preprocessor.run(documents=[doc])
    print(result["documents"])
    ```
    """

    def __init__(  # noqa: PLR0913 (too-many-arguments)
        self,
        *,
        # --- DocumentSplitter arguments ---
        split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
        split_length: int = 250,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
        respect_sentence_boundary: bool = False,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
        # --- DocumentCleaner arguments ---
        remove_empty_lines: bool = True,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        keep_id: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
        ascii_only: bool = False,
    ) -> None:
        """
        Initialize a DocumentPreprocessor that first splits and then cleans documents.

        All arguments are keyword-only and are stored on the instance so that `to_dict` can serialize them.

        **Splitter Parameters**:
        :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
        :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
        :param split_overlap: The number of overlapping units between consecutive splits.
        :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
            with the previous split.
        :param splitting_function: A custom function for splitting if `split_by="function"`.
        :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
        :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
            `respect_sentence_boundary=True`.
        :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
        :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
            languages.

        **Cleaner Parameters**:
        :param remove_empty_lines: If `True`, removes empty lines.
        :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
        :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
        :param keep_id: If `True`, keeps the original document IDs.
        :param remove_substrings: A list of strings to remove from the document content.
        :param remove_regex: A regex pattern whose matches will be removed from the document content.
        :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
        :param ascii_only: If `True`, converts text to ASCII only.
        """
        # Store the cleaner arguments for serialization
        self.remove_empty_lines = remove_empty_lines
        self.remove_extra_whitespaces = remove_extra_whitespaces
        self.remove_repeated_substrings = remove_repeated_substrings
        self.keep_id = keep_id
        self.remove_substrings = remove_substrings
        self.remove_regex = remove_regex
        self.unicode_normalization = unicode_normalization
        self.ascii_only = ascii_only

        # Store the splitter arguments for serialization
        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
        self.respect_sentence_boundary = respect_sentence_boundary
        self.language = language
        self.use_split_rules = use_split_rules
        self.extend_abbreviations = extend_abbreviations

        # Instantiate sub-components
        splitter = DocumentSplitter(
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=self.splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

        cleaner = DocumentCleaner(
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
        )

        # Build the Pipeline
        pp = Pipeline()

        pp.add_component("splitter", splitter)
        pp.add_component("cleaner", cleaner)

        # Connect the splitter output to cleaner
        pp.connect("splitter.documents", "cleaner.documents")
        self.pipeline = pp

        # Define how pipeline inputs/outputs map to sub-component inputs/outputs
        self.input_mapping = {
            # The pipeline input "documents" feeds into "splitter.documents"
            "documents": ["splitter.documents"]
        }
        # The pipeline output "documents" comes from "cleaner.documents"
        self.output_mapping = {"cleaner.documents": "documents"}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize SuperComponent to a dictionary.

        Every init argument stored on the instance is passed to `default_to_dict`. The custom
        `splitting_function`, when set, is first converted with `serialize_callable`; otherwise
        it is emitted as `None`.

        :return:
            Dictionary with serialized data.
        """
        # A callable cannot be stored as-is, so it is serialized separately from the plain values
        splitting_function = (
            serialize_callable(self.splitting_function) if self.splitting_function is not None else None
        )

        return default_to_dict(
            self,
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor":
        """
        Deserializes the SuperComponent from a dictionary.

        If the payload carries a serialized `splitting_function`, it is replaced in place (inside
        `data["init_parameters"]`) by the callable returned from `deserialize_callable` before the
        payload is handed to `default_from_dict`.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized SuperComponent.
        """
        # Use .get() so a payload without "init_parameters" does not fail here with a bare KeyError;
        # validating the overall payload is left to `default_from_dict`.
        init_parameters = data.get("init_parameters", {})
        splitting_function = init_parameters.get("splitting_function")
        if splitting_function:
            init_parameters["splitting_function"] = deserialize_callable(splitting_function)
        return default_from_dict(cls, data)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc