• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 8096865523

29 Feb 2024 01:31PM UTC coverage: 89.905% (-0.2%) from 90.144%
8096865523

push

github

web-flow
chore: enforce kwarg logging (#7207)

* chore: add logger which eases logging of extras

* chore: start migrating to key value

* fix: import fixes

* tests: temporarily comment out breaking test

* refactor: move to kwarg based logging

* style: fix import order

* chore: implement self-review comments

* test: drop failing test

* chore: fix more import orders

* docs: add changelog

* tests: fix broken tests

* chore: fix getting the frames

* chore: add comment

* chore: cleanup

* chore: adapt remaining `%s` usages

5281 of 5874 relevant lines covered (89.9%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.81
haystack/components/preprocessors/document_cleaner.py
1
import re
1✔
2
from copy import deepcopy
1✔
3
from functools import partial, reduce
1✔
4
from itertools import chain
1✔
5
from typing import Generator, List, Optional, Set
1✔
6

7
from haystack import Document, component, logging
1✔
8

9
logger = logging.getLogger(__name__)
1✔
10

11

12
@component
class DocumentCleaner:
    """
    Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes,
    page headers and footers (in this order).

    The `run` method returns a dictionary with a single `documents` key holding the cleaned Documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentCleaner

    doc = Document(content="This   is  a  document  to  clean\\n\\n\\nsubstring to remove")

    cleaner = DocumentCleaner(remove_substrings = ["substring to remove"])
    result = cleaner.run(documents=[doc])

    assert result["documents"][0].content == "This is a document to clean "
    ```
    """
31

32
    def __init__(
1✔
33
        self,
34
        remove_empty_lines: bool = True,
35
        remove_extra_whitespaces: bool = True,
36
        remove_repeated_substrings: bool = False,
37
        remove_substrings: Optional[List[str]] = None,
38
        remove_regex: Optional[str] = None,
39
    ):
40
        """
41
        :param remove_empty_lines: Whether to remove empty lines.
42
        :param remove_extra_whitespaces: Whether to remove extra whitespaces.
43
        :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages.
44
            Pages in the text need to be separated by form feed character "\\f",
45
            which is supported by `TextFileToDocument` and `AzureOCRDocumentConverter`.
46
        :param remove_substrings: List of substrings to remove from the text.
47
        :param remove_regex: Regex to match and replace substrings by "".
48
        """
49

50
        self.remove_empty_lines = remove_empty_lines
1✔
51
        self.remove_extra_whitespaces = remove_extra_whitespaces
1✔
52
        self.remove_repeated_substrings = remove_repeated_substrings
1✔
53
        self.remove_substrings = remove_substrings
1✔
54
        self.remove_regex = remove_regex
1✔
55

56
    @component.output_types(documents=List[Document])
1✔
57
    def run(self, documents: List[Document]):
1✔
58
        """
59
        Cleans up the documents.
60

61
        :param documents: List of Documents to clean.
62

63
        :returns: A dictionary with the following key:
64
            - `documents`: List of cleaned Documents.
65

66
        :raises TypeError: if documents is not a list of Documents.
67
        """
68
        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
1✔
69
            raise TypeError("DocumentCleaner expects a List of Documents as input.")
1✔
70

71
        cleaned_docs = []
1✔
72
        for doc in documents:
1✔
73
            if doc.content is None:
1✔
74
                logger.warning(
1✔
75
                    "DocumentCleaner only cleans text documents but document.content for document ID %{document_id} is None.",
76
                    document_id=doc.id,
77
                )
78
                cleaned_docs.append(doc)
1✔
79
                continue
1✔
80
            text = doc.content
1✔
81

82
            if self.remove_extra_whitespaces:
1✔
83
                text = self._remove_extra_whitespaces(text)
1✔
84
            if self.remove_empty_lines:
1✔
85
                text = self._remove_empty_lines(text)
1✔
86
            if self.remove_substrings:
1✔
87
                text = self._remove_substrings(text, self.remove_substrings)
1✔
88
            if self.remove_regex:
1✔
89
                text = self._remove_regex(text, self.remove_regex)
1✔
90
            if self.remove_repeated_substrings:
1✔
91
                text = self._remove_repeated_substrings(text)
1✔
92

93
            cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta)))
1✔
94

95
        return {"documents": cleaned_docs}
1✔
96

97
    def _remove_empty_lines(self, text: str) -> str:
1✔
98
        """
99
        Remove empty lines and lines that contain nothing but whitespaces from text.
100
        :param text: Text to clean.
101
        :returns: The text without empty lines.
102
        """
103
        lines = text.split("\n")
1✔
104
        non_empty_lines = filter(lambda line: line.strip() != "", lines)
1✔
105
        return "\n".join(non_empty_lines)
1✔
106

107
    def _remove_extra_whitespaces(self, text: str) -> str:
1✔
108
        """
109
        Remove extra whitespaces from text.
110
        :param text: Text to clean.
111
        :returns: The text without extra whitespaces.
112
        """
113
        return re.sub(r"\s\s+", " ", text).strip()
1✔
114

115
    def _remove_regex(self, text: str, regex: str) -> str:
1✔
116
        """
117
        Remove substrings that match the specified regex from the text.
118
        :param text: Text to clean.
119
        :param regex: Regex to match and replace substrings by "".
120
        :returns: The text without the substrings that match the regex.
121
        """
122
        return re.sub(regex, "", text).strip()
1✔
123

124
    def _remove_substrings(self, text: str, substrings: List[str]) -> str:
1✔
125
        """
126
        Remove all specified substrings from the text.
127
        :param text: Text to clean.
128
        :param substrings: Substrings to remove.
129
        :returns: The text without the specified substrings.
130
        """
131
        for substring in substrings:
1✔
132
            text = text.replace(substring, "")
1✔
133
        return text
1✔
134

135
    def _remove_repeated_substrings(self, text: str) -> str:
        """
        Remove substrings that repeat on every page, such as headers or footers.
        Pages in the text need to be separated by form feed character "\f".

        :param text: Text to clean.
        :returns: The text without the repeated substrings.
        """
        # Heuristic defaults: inspect 300 characters at each page edge and skip the
        # first and last page (they often differ, e.g. title page / TOC).
        search_chars = 300
        return self._find_and_remove_header_footer(
            text, n_chars=search_chars, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
        )
145

146
    def _find_and_remove_header_footer(
        self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
    ) -> str:
        """
        Heuristic to find footers and headers across different pages by searching for the longest common string.
        Pages in the text need to be separated by form feed character "\f".
        For headers, we only search in the first n_chars characters (for footer: last n_chars).
        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
         but won't detect "Page 3 of 4" or similar.

        :param n_chars: The number of first/last characters where the header/footer shall be searched in.
        :param n_first_pages_to_ignore: The number of first pages to ignore (e.g. TOCs often don't contain footer/header).
        :param n_last_pages_to_ignore: The number of last pages to ignore.
        :returns: The text without the found headers and footers.
        """

        pages = text.split("\f")

        # header: compare only the first n_chars of each page, skipping the ignored
        # leading/trailing pages.
        # NOTE(review): n_last_pages_to_ignore == 0 would make the slice [n:-0] empty;
        # the only caller in this file always passes 1 — confirm before generalizing.
        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_header = self._find_longest_common_ngram(start_of_pages)
        if found_header:
            # str.replace removes every occurrence of the header on each page.
            pages = [page.replace(found_header, "") for page in pages]

        # footer: searched after the header has been removed, so this runs on the
        # already header-stripped page text — the order of the two passes matters.
        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_footer = self._find_longest_common_ngram(end_of_pages)
        if found_footer:
            pages = [page.replace(found_footer, "") for page in pages]

        logger.debug(
            "Removed header '{header}' and footer '{footer}' in document", header=found_header, footer=found_footer
        )
        text = "\f".join(pages)
        return text
1✔
181

182
    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
1✔
183
        """
184
        Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace.
185
        :param seq: The sequence to generate ngrams from.
186
        :param n: The length of the ngrams to generate.
187
        :returns: A Generator generating all ngrams of length n from the given sequence.
188
        """
189

190
        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
191
        # we add a space here and remove it after creation of the ngrams again (see below)
192
        seq = seq.replace("\n", " \n")
1✔
193
        seq = seq.replace("\t", " \t")
1✔
194

195
        words = seq.split(" ")
1✔
196
        ngrams = (
1✔
197
            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
198
        )
199

200
        return ngrams
1✔
201

202
    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
1✔
203
        """
204
        Generates all possible ngrams from a given sequence of text.
205
        Considering all ngram lengths between the minimum and maximum length.
206

207
        :param seq: The sequence to generate ngrams from.
208
        :param min_ngram: The minimum length of ngram to consider.
209
        :param max_ngram: The maximum length of ngram to consider.
210
        :returns: A set of all ngrams from the given sequence.
211
        """
212
        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
1✔
213
        ngrams = map(partial(self._ngram, seq), lengths)
1✔
214
        res = set(chain.from_iterable(ngrams))
1✔
215
        return res
1✔
216

217
    def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, max_ngram: int = 30) -> str:
1✔
218
        """
219
        Find the longest common ngram across a list of text sequences (e.g. start of pages).
220
        Considering all ngram lengths between the minimum and maximum length. Helpful for finding footers, headers etc.
221
        Empty sequences are ignored.
222

223
        :param sequences: The list of strings that shall be searched for common n_grams.
224
        :param max_ngram: The maximum length of ngram to consider.
225
        :param min_ngram: The minimum length of ngram to consider.
226
        :returns: The longest ngram that all sequences have in common.
227
        """
228
        sequences = [s for s in sequences if s]  # filter empty sequences
1✔
229
        if not sequences:
1✔
230
            return ""
×
231
        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
1✔
232
        intersection = reduce(set.intersection, seqs_ngrams)
1✔
233

234
        longest = max(intersection, key=len, default="")
1✔
235
        return longest if longest.strip() else ""
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc