• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 8096865523

29 Feb 2024 01:31PM UTC coverage: 89.905% (-0.2%) from 90.144%
8096865523

push

github

web-flow
chore: enforce kwarg logging (#7207)

* chore: add logger which eases logging of extras

* chore: start migrating to key value

* fix: import fixes

* tests: temporarily comment out breaking test

* refactor: move to kwarg based logging

* style: fix import order

* chore: implement self-review comments

* test: drop failing test

* chore: fix more import orders

* docs: add changelog

* tests: fix broken tests

* chore: fix getting the frames

* chore: add comment

* chore: cleanup

* chore: adapt remaining `%s` usages

5281 of 5874 relevant lines covered (89.9%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.81
haystack/components/preprocessors/document_cleaner.py
1
import re
1✔
2
from copy import deepcopy
1✔
3
from functools import partial, reduce
1✔
4
from itertools import chain
1✔
5
from typing import Generator, List, Optional, Set
1✔
6

7
from haystack import Document, component, logging
1✔
8

9
logger = logging.getLogger(__name__)
1✔
10

11

12
@component
class DocumentCleaner:
    """
    Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes,
    page headers and footers (in this order).

    The `run` method returns a dictionary with a single `documents` key holding the cleaned Documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentCleaner

    doc = Document(content="This   is  a  document  to  clean\\n\\n\\nsubstring to remove")

    cleaner = DocumentCleaner(remove_substrings = ["substring to remove"])
    result = cleaner.run(documents=[doc])

    assert result["documents"][0].content == "This is a document to clean "
    ```
    """
31

32
    def __init__(
1✔
33
        self,
34
        remove_empty_lines: bool = True,
35
        remove_extra_whitespaces: bool = True,
36
        remove_repeated_substrings: bool = False,
37
        remove_substrings: Optional[List[str]] = None,
38
        remove_regex: Optional[str] = None,
39
    ):
40
        """
41
        :param remove_empty_lines: Whether to remove empty lines.
42
        :param remove_extra_whitespaces: Whether to remove extra whitespaces.
43
        :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages.
44
            Pages in the text need to be separated by form feed character "\\f",
45
            which is supported by `TextFileToDocument` and `AzureOCRDocumentConverter`.
46
        :param remove_substrings: List of substrings to remove from the text.
47
        :param remove_regex: Regex to match and replace substrings by "".
48
        """
49

50
        self.remove_empty_lines = remove_empty_lines
1✔
51
        self.remove_extra_whitespaces = remove_extra_whitespaces
1✔
52
        self.remove_repeated_substrings = remove_repeated_substrings
1✔
53
        self.remove_substrings = remove_substrings
1✔
54
        self.remove_regex = remove_regex
1✔
55

56
    @component.output_types(documents=List[Document])
1✔
57
    def run(self, documents: List[Document]):
1✔
58
        """
59
        Cleans up the documents.
60

61
        :param documents: List of Documents to clean.
62

63
        :returns: A dictionary with the following key:
64
            - `documents`: List of cleaned Documents.
65

66
        :raises TypeError: if documents is not a list of Documents.
67
        """
68
        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
1✔
69
            raise TypeError("DocumentCleaner expects a List of Documents as input.")
1✔
70

71
        cleaned_docs = []
1✔
72
        for doc in documents:
1✔
73
            if doc.content is None:
1✔
74
                logger.warning(
1✔
75
                    "DocumentCleaner only cleans text documents but document.content for document ID %{document_id} is None.",
76
                    document_id=doc.id,
77
                )
78
                cleaned_docs.append(doc)
1✔
79
                continue
1✔
80
            text = doc.content
1✔
81

82
            if self.remove_extra_whitespaces:
1✔
83
                text = self._remove_extra_whitespaces(text)
1✔
84
            if self.remove_empty_lines:
1✔
85
                text = self._remove_empty_lines(text)
1✔
86
            if self.remove_substrings:
1✔
87
                text = self._remove_substrings(text, self.remove_substrings)
1✔
88
            if self.remove_regex:
1✔
89
                text = self._remove_regex(text, self.remove_regex)
1✔
90
            if self.remove_repeated_substrings:
1✔
91
                text = self._remove_repeated_substrings(text)
1✔
92

93
            cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta)))
1✔
94

95
        return {"documents": cleaned_docs}
1✔
96

97
    def _remove_empty_lines(self, text: str) -> str:
1✔
98
        """
99
        Remove empty lines and lines that contain nothing but whitespaces from text.
100
        :param text: Text to clean.
101
        :returns: The text without empty lines.
102
        """
103
        lines = text.split("\n")
1✔
104
        non_empty_lines = filter(lambda line: line.strip() != "", lines)
1✔
105
        return "\n".join(non_empty_lines)
1✔
106

107
    def _remove_extra_whitespaces(self, text: str) -> str:
1✔
108
        """
109
        Remove extra whitespaces from text.
110
        :param text: Text to clean.
111
        :returns: The text without extra whitespaces.
112
        """
113
        return re.sub(r"\s\s+", " ", text).strip()
1✔
114

115
    def _remove_regex(self, text: str, regex: str) -> str:
1✔
116
        """
117
        Remove substrings that match the specified regex from the text.
118
        :param text: Text to clean.
119
        :param regex: Regex to match and replace substrings by "".
120
        :returns: The text without the substrings that match the regex.
121
        """
122
        return re.sub(regex, "", text).strip()
1✔
123

124
    def _remove_substrings(self, text: str, substrings: List[str]) -> str:
1✔
125
        """
126
        Remove all specified substrings from the text.
127
        :param text: Text to clean.
128
        :param substrings: Substrings to remove.
129
        :returns: The text without the specified substrings.
130
        """
131
        for substring in substrings:
1✔
132
            text = text.replace(substring, "")
1✔
133
        return text
1✔
134

135
    def _remove_repeated_substrings(self, text: str) -> str:
        """
        Remove substrings that repeat on every page, such as headers or footers.
        Pages in the text need to be separated by form feed character "\f".

        :param text: Text to clean.
        :returns: The text without the repeated substrings.
        """
        # Heuristic defaults: inspect 300 characters at each page edge and skip the
        # first and last page (they often differ, e.g. title page / TOC).
        search_chars = 300
        return self._find_and_remove_header_footer(
            text, n_chars=search_chars, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
        )
145

146
    def _find_and_remove_header_footer(
        self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
    ) -> str:
        """
        Heuristic to find footers and headers across different pages by searching for the longest common string.
        Pages in the text need to be separated by form feed character "\f".
        For headers, we only search in the first n_chars characters (for footer: last n_chars).
        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
         but won't detect "Page 3 of 4" or similar.

        :param n_chars: The number of first/last characters where the header/footer shall be searched in.
        :param n_first_pages_to_ignore: The number of first pages to ignore (e.g. TOCs often don't contain footer/header).
        :param n_last_pages_to_ignore: The number of last pages to ignore.
        :returns: The text without the found headers and footers.
        """

        pages = text.split("\f")

        # header: compare only the first n_chars of each page, skipping the ignored
        # leading/trailing pages.
        # NOTE(review): n_last_pages_to_ignore == 0 would make the slice [n:-0] empty;
        # the only caller in this file always passes 1 — confirm before generalizing.
        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_header = self._find_longest_common_ngram(start_of_pages)
        if found_header:
            # str.replace removes every occurrence of the header on each page.
            pages = [page.replace(found_header, "") for page in pages]

        # footer: searched after the header has been removed, so this runs on the
        # already header-stripped page text — the order of the two passes matters.
        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_footer = self._find_longest_common_ngram(end_of_pages)
        if found_footer:
            pages = [page.replace(found_footer, "") for page in pages]

        logger.debug(
            "Removed header '{header}' and footer '{footer}' in document", header=found_header, footer=found_footer
        )
        text = "\f".join(pages)
        return text
1✔
181

182
    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
1✔
183
        """
184
        Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace.
185
        :param seq: The sequence to generate ngrams from.
186
        :param n: The length of the ngrams to generate.
187
        :returns: A Generator generating all ngrams of length n from the given sequence.
188
        """
189

190
        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
191
        # we add a space here and remove it after creation of the ngrams again (see below)
192
        seq = seq.replace("\n", " \n")
1✔
193
        seq = seq.replace("\t", " \t")
1✔
194

195
        words = seq.split(" ")
1✔
196
        ngrams = (
1✔
197
            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
198
        )
199

200
        return ngrams
1✔
201

202
    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
1✔
203
        """
204
        Generates all possible ngrams from a given sequence of text.
205
        Considering all ngram lengths between the minimum and maximum length.
206

207
        :param seq: The sequence to generate ngrams from.
208
        :param min_ngram: The minimum length of ngram to consider.
209
        :param max_ngram: The maximum length of ngram to consider.
210
        :returns: A set of all ngrams from the given sequence.
211
        """
212
        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
1✔
213
        ngrams = map(partial(self._ngram, seq), lengths)
1✔
214
        res = set(chain.from_iterable(ngrams))
1✔
215
        return res
1✔
216

217
    def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, max_ngram: int = 30) -> str:
1✔
218
        """
219
        Find the longest common ngram across a list of text sequences (e.g. start of pages).
220
        Considering all ngram lengths between the minimum and maximum length. Helpful for finding footers, headers etc.
221
        Empty sequences are ignored.
222

223
        :param sequences: The list of strings that shall be searched for common n_grams.
224
        :param max_ngram: The maximum length of ngram to consider.
225
        :param min_ngram: The minimum length of ngram to consider.
226
        :returns: The longest ngram that all sequences have in common.
227
        """
228
        sequences = [s for s in sequences if s]  # filter empty sequences
1✔
229
        if not sequences:
1✔
230
            return ""
×
231
        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
1✔
232
        intersection = reduce(set.intersection, seqs_ngrams)
1✔
233

234
        longest = max(intersection, key=len, default="")
1✔
235
        return longest if longest.strip() else ""
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc