14854285423

Committed 06 May 2025 07:42AM CUT coverage: 90.403% (-0.007%) from 90.41%

Build # 14854285423

Build Type

Pull #9329

github

Committed by

web-flow

Commit Message

Merge 45165840a into 64f384b52

Pull Request #9329: feat: add py.typed; adjust `Component` protocol

Run Details

10908 of 12066 relevant lines covered (90.4%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.56

haystack/components/preprocessors/document_preprocessor.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Callable, Dict, List, Literal, Optional

from haystack import Pipeline, default_from_dict, default_to_dict, super_component
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.utils import deserialize_callable, serialize_callable


@super_component
class DocumentPreprocessor:
    """
    A SuperComponent that first splits and then cleans documents.

    This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
    It takes a list of documents as input and returns a processed list of documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentPreprocessor

    doc = Document(content="I love pizza!")
    preprocessor = DocumentPreprocessor()
    result = preprocessor.run(documents=[doc])
    print(result["documents"])
    ```
    """

    def __init__(  # noqa: PLR0913 (too-many-arguments)
        self,
        *,
        # --- DocumentSplitter arguments ---
        split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
        split_length: int = 250,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
        respect_sentence_boundary: bool = False,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
        # --- DocumentCleaner arguments ---
        remove_empty_lines: bool = True,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        keep_id: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
        ascii_only: bool = False,
    ) -> None:
        """
        Initialize a DocumentPreprocessor that first splits and then cleans documents.

        All arguments are keyword-only. They are stored on the instance unchanged (so that `to_dict` can
        serialize them) and then forwarded to the wrapped DocumentSplitter and DocumentCleaner.

        **Splitter Parameters**:
        :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
        :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
        :param split_overlap: The number of overlapping units between consecutive splits.
        :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
            with the previous split.
        :param splitting_function: A custom function for splitting if `split_by="function"`.
        :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
        :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
            `respect_sentence_boundary=True`.
        :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
        :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
            languages.

        **Cleaner Parameters**:
        :param remove_empty_lines: If `True`, removes empty lines.
        :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
        :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
        :param keep_id: If `True`, keeps the original document IDs.
        :param remove_substrings: A list of strings to remove from the document content.
        :param remove_regex: A regex pattern whose matches will be removed from the document content.
        :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
        :param ascii_only: If `True`, converts text to ASCII only.
        """
        # Store arguments for serialization: `to_dict` reads every one of these attributes back.
        self.remove_empty_lines = remove_empty_lines
        self.remove_extra_whitespaces = remove_extra_whitespaces
        self.remove_repeated_substrings = remove_repeated_substrings
        self.keep_id = keep_id
        self.remove_substrings = remove_substrings
        self.remove_regex = remove_regex
        self.unicode_normalization = unicode_normalization
        self.ascii_only = ascii_only

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
        self.respect_sentence_boundary = respect_sentence_boundary
        self.language = language
        self.use_split_rules = use_split_rules
        self.extend_abbreviations = extend_abbreviations

        # Instantiate sub-components
        splitter = DocumentSplitter(
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=self.splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

        cleaner = DocumentCleaner(
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
        )

        # Build the Pipeline
        pp = Pipeline()

        pp.add_component("splitter", splitter)
        pp.add_component("cleaner", cleaner)

        # Connect the splitter output to cleaner
        pp.connect("splitter.documents", "cleaner.documents")
        self.pipeline = pp

        # Define how pipeline inputs/outputs map to sub-component inputs/outputs
        self.input_mapping = {
            # The pipeline input "documents" feeds into "splitter.documents"
            "documents": ["splitter.documents"]
        }
        # The pipeline output "documents" comes from "cleaner.documents"
        self.output_mapping = {"cleaner.documents": "documents"}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize SuperComponent to a dictionary.

        The custom `splitting_function`, when set, is converted with `serialize_callable`; every other
        init argument is passed to `default_to_dict` as stored.

        :returns:
            Dictionary with serialized data.
        """
        # A callable cannot be emitted as-is; leave it as `None` when no custom function was provided.
        splitting_function = None
        if self.splitting_function is not None:
            splitting_function = serialize_callable(self.splitting_function)

        return default_to_dict(
            self,
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor":
        """
        Deserializes the SuperComponent from a dictionary.

        A serialized `splitting_function` entry, when present, is replaced in place with the callable
        returned by `deserialize_callable` before the data is handed to `default_from_dict`.

        :param data:
            Dictionary to deserialize from. The `"init_parameters"` key is optional here; when it is
            absent there is nothing to deserialize and the data is passed on untouched.
        :returns:
            Deserialized SuperComponent.
        """
        # Use `.get` so a payload without "init_parameters" does not fail with a bare KeyError at this
        # point; validation of the overall payload is left to `default_from_dict`.
        init_params = data.get("init_parameters", {})
        splitting_function = init_params.get("splitting_function", None)
        if splitting_function:
            # When the key exists, `init_params` is the very dict stored in `data`, so this updates `data`.
            init_params["splitting_function"] = deserialize_callable(splitting_function)
        return default_from_dict(cls, data)

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	from typing import Any, Callable, Dict, List, Literal, Optional	1✔
6
7	from haystack import Pipeline, default_from_dict, default_to_dict, super_component	1✔
8	from haystack.components.preprocessors.document_cleaner import DocumentCleaner	1✔
9	from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language	1✔
10	from haystack.utils import deserialize_callable, serialize_callable	1✔
11
12
13	@super_component	1✔
14	class DocumentPreprocessor:	1✔
15	"""
16	A SuperComponent that first splits and then cleans documents.
17
18	This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
19	It takes a list of documents as input and returns a processed list of documents.
20
21	Usage example:
22	```python
23	from haystack import Document
24	from haystack.components.preprocessors import DocumentPreprocessor
25
26	doc = Document(content="I love pizza!")
27	preprocessor = DocumentPreprocessor()
28	result = preprocessor.run(documents=[doc])
29	print(result["documents"])
30	```
31	"""
32
33	def __init__( # noqa: PLR0913 (too-many-arguments)	1✔
34	self,
35	*,
36	# --- DocumentSplitter arguments ---
37	split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
38	split_length: int = 250,
39	split_overlap: int = 0,
40	split_threshold: int = 0,
41	splitting_function: Optional[Callable[[str], List[str]]] = None,
42	respect_sentence_boundary: bool = False,
43	language: Language = "en",
44	use_split_rules: bool = True,
45	extend_abbreviations: bool = True,
46	# --- DocumentCleaner arguments ---
47	remove_empty_lines: bool = True,
48	remove_extra_whitespaces: bool = True,
49	remove_repeated_substrings: bool = False,
50	keep_id: bool = False,
51	remove_substrings: Optional[List[str]] = None,
52	remove_regex: Optional[str] = None,
53	unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
54	ascii_only: bool = False,
55	) -> None:
56	"""
57	Initialize a DocumentPreProcessor that first splits and then cleans documents.
58
59	Splitter Parameters:
60	:param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
61	:param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
62	:param split_overlap: The number of overlapping units between consecutive splits.
63	:param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
64	with the previous split.
65	:param splitting_function: A custom function for splitting if `split_by="function"`.
66	:param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
67	:param language: Language used by the sentence tokenizer if `split_by="sentence"` or
68	`respect_sentence_boundary=True`.
69	:param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
70	:param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
71	languages.
72
73	Cleaner Parameters:
74	:param remove_empty_lines: If `True`, removes empty lines.
75	:param remove_extra_whitespaces: If `True`, removes extra whitespaces.
76	:param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
77	:param keep_id: If `True`, keeps the original document IDs.
78	:param remove_substrings: A list of strings to remove from the document content.
79	:param remove_regex: A regex pattern whose matches will be removed from the document content.
80	:param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
81	:param ascii_only: If `True`, converts text to ASCII only.
82	"""
83	# Store arguments for serialization
84	self.remove_empty_lines = remove_empty_lines	1✔
85	self.remove_extra_whitespaces = remove_extra_whitespaces	1✔
86	self.remove_repeated_substrings = remove_repeated_substrings	1✔
87	self.keep_id = keep_id	1✔
88	self.remove_substrings = remove_substrings	1✔
89	self.remove_regex = remove_regex	1✔
90	self.unicode_normalization = unicode_normalization	1✔
91	self.ascii_only = ascii_only	1✔
92
93	self.split_by = split_by	1✔
94	self.split_length = split_length	1✔
95	self.split_overlap = split_overlap	1✔
96	self.split_threshold = split_threshold	1✔
97	self.splitting_function = splitting_function	1✔
98	self.respect_sentence_boundary = respect_sentence_boundary	1✔
99	self.language = language	1✔
100	self.use_split_rules = use_split_rules	1✔
101	self.extend_abbreviations = extend_abbreviations	1✔
102
103	# Instantiate sub-components
104	splitter = DocumentSplitter(	1✔
105	split_by=self.split_by,
106	split_length=self.split_length,
107	split_overlap=self.split_overlap,
108	split_threshold=self.split_threshold,
109	splitting_function=self.splitting_function,
110	respect_sentence_boundary=self.respect_sentence_boundary,
111	language=self.language,
112	use_split_rules=self.use_split_rules,
113	extend_abbreviations=self.extend_abbreviations,
114	)
115
116	cleaner = DocumentCleaner(	1✔
117	remove_empty_lines=self.remove_empty_lines,
118	remove_extra_whitespaces=self.remove_extra_whitespaces,
119	remove_repeated_substrings=self.remove_repeated_substrings,
120	keep_id=self.keep_id,
121	remove_substrings=self.remove_substrings,
122	remove_regex=self.remove_regex,
123	unicode_normalization=self.unicode_normalization,
124	ascii_only=self.ascii_only,
125	)
126
127	# Build the Pipeline
128	pp = Pipeline()	1✔
129
130	pp.add_component("splitter", splitter)	1✔
131	pp.add_component("cleaner", cleaner)	1✔
132
133	# Connect the splitter output to cleaner
134	pp.connect("splitter.documents", "cleaner.documents")	1✔
135	self.pipeline = pp	1✔
136
137	# Define how pipeline inputs/outputs map to sub-component inputs/outputs
138	self.input_mapping = {	1✔
139	# The pipeline input "documents" feeds into "splitter.documents"
140	"documents": ["splitter.documents"]
141	}
142	# The pipeline output "documents" comes from "cleaner.documents"
143	self.output_mapping = {"cleaner.documents": "documents"}	1✔
144
145	def to_dict(self) -> Dict[str, Any]:	1✔
146	"""
147	Serialize SuperComponent to a dictionary.
148
149	:return:
150	Dictionary with serialized data.
151	"""
152	splitting_function = None	1✔
153	if self.splitting_function is not None:	1✔
154	splitting_function = serialize_callable(self.splitting_function)	×
155
156	return default_to_dict(	1✔
157	self,
158	remove_empty_lines=self.remove_empty_lines,
159	remove_extra_whitespaces=self.remove_extra_whitespaces,
160	remove_repeated_substrings=self.remove_repeated_substrings,
161	keep_id=self.keep_id,
162	remove_substrings=self.remove_substrings,
163	remove_regex=self.remove_regex,
164	unicode_normalization=self.unicode_normalization,
165	ascii_only=self.ascii_only,
166	split_by=self.split_by,
167	split_length=self.split_length,
168	split_overlap=self.split_overlap,
169	split_threshold=self.split_threshold,
170	splitting_function=splitting_function,
171	respect_sentence_boundary=self.respect_sentence_boundary,
172	language=self.language,
173	use_split_rules=self.use_split_rules,
174	extend_abbreviations=self.extend_abbreviations,
175	)
176
177	@classmethod	1✔
178	def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor":	1✔
179	"""
180	Deserializes the SuperComponent from a dictionary.
181
182	:param data:
183	Dictionary to deserialize from.
184	:returns:
185	Deserialized SuperComponent.
186	"""
187	splitting_function = data["init_parameters"].get("splitting_function", None)	1✔
188	if splitting_function:	1✔
189	data["init_parameters"]["splitting_function"] = deserialize_callable(splitting_function)	×
190	return default_from_dict(cls, data)	1✔

deepset-ai / haystack / 14854285423

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous