• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 11890937783

18 Nov 2024 10:54AM UTC coverage: 90.244% (-0.002%) from 90.246%
11890937783

push

github

web-flow
Fix `DocumentSplitter` not splitting by function (#8549)

* Fix DocumentSplitter not splitting by function

* Make the split_by mapping a constant

7844 of 8692 relevant lines covered (90.24%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.32
haystack/components/preprocessors/document_splitter.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from copy import deepcopy
1✔
6
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
1✔
7

8
from more_itertools import windowed
1✔
9

10
from haystack import Document, component, logging
1✔
11
from haystack.core.serialization import default_from_dict, default_to_dict
1✔
12
from haystack.utils import deserialize_callable, serialize_callable
1✔
13

14
logger = logging.getLogger(__name__)
1✔
15

16
# Maps the 'split_by' argument to the actual char used to split the Documents.
17
# 'function' is not in the mapping because it delegates to a user-supplied callable instead of splitting on a character.
18
_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " ", "line": "\n"}
1✔
19

20

21
@component
1✔
22
class DocumentSplitter:
1✔
23
    """
24
    Splits long documents into smaller chunks.
25

26
    This is a common preprocessing step during indexing.
27
    It helps Embedders create meaningful semantic representations
28
    and prevents exceeding language model context limits.
29

30
    The DocumentSplitter is compatible with the following DocumentStores:
31
    - [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
32
    - [Chroma](https://docs.haystack.deepset.ai/docs/chromadocumentstore) limited support, overlapping information is
33
      not stored
34
    - [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
35
    - [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
36
    - [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
37
    - [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store) limited support, overlapping
38
       information is not stored
39
    - [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
40
    - [Weaviate](https://docs.haystack.deepset.ai/docs/weaviatedocumentstore)
41

42
    ### Usage example
43

44
    ```python
45
    from haystack import Document
46
    from haystack.components.preprocessors import DocumentSplitter
47

48
    doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")
49

50
    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
51
    result = splitter.run(documents=[doc])
52
    ```
53
    """
54

55
    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        split_by: Literal["function", "page", "passage", "sentence", "word", "line"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
    ):
        """
        Initialize DocumentSplitter.

        :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
            `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
            `passage` for splitting by double line breaks ("\\n\\n"), `line` for splitting each line ("\\n")
            or `function` for delegating the split to `splitting_function`.
        :param split_length: The maximum number of units in each split. Must be greater than 0.
        :param split_overlap: The number of overlapping units for each split. Must be greater than or equal to 0
            and, unless `split_by` is "function", smaller than `split_length`.
        :param split_threshold: The minimum number of units per split. If a split has fewer units
            than the threshold, it's attached to the previous split.
        :param splitting_function: Necessary when `split_by` is set to "function".
            This is a function which must accept a single `str` as input and return a `list` of `str` as output,
            representing the chunks after splitting.

        :raises ValueError: if `split_by` is not a supported value, if `split_by` is "function" without a
            `splitting_function`, if `split_length` is not positive, if `split_overlap` is negative, or if
            `split_overlap` is not smaller than `split_length` while splitting by a unit.
        """

        self.split_by = split_by
        if split_by not in ("function", "page", "passage", "sentence", "word", "line"):
            raise ValueError("split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'.")
        if split_by == "function" and splitting_function is None:
            raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        # The sliding window used for unit-based splitting advances by `split_length - split_overlap` units.
        # A non-positive step cannot make progress, so reject it here instead of failing later inside `run`.
        # Splitting by function never builds that window, hence the exemption.
        if split_by != "function" and split_overlap >= split_length:
            raise ValueError("split_overlap must be smaller than split_length.")
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
1✔
91

92
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split every input document into smaller documents.

        The splitting unit is taken from `split_by`; unit-based splits are at most `split_length` units long
        and share `split_overlap` units with their neighbour.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. Each document includes:
                - A metadata field `source_id` to track the original document.
                - A metadata field `page_number` to track the original page number
                  (only set when splitting by a unit, not by a custom function).
                - All other metadata copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        if not isinstance(documents, list):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")
        # Only the first element is inspected; the remaining ones are trusted to have the same type.
        if documents and not isinstance(documents[0], Document):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        results: List[Document] = []
        for document in documents:
            content = document.content
            if content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {document.id} is None."
                )
            if content == "":
                # Nothing to split: warn and move on instead of emitting an empty document.
                logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=document.id)
                continue
            results.extend(self._split(document))

        return {"documents": results}
1✔
126

127
    def _split(self, to_split: Document) -> List[Document]:
        """
        Split a single Document according to the configured `split_by` mode.

        With `split_by="function"` the content is handed to `splitting_function` and every returned chunk
        becomes a Document that carries a copy of the original metadata plus `source_id`.
        Otherwise the content is cut on the delimiter mapped to `split_by`, grouped into windows by
        `_concatenate_units` and turned into Documents by `_create_docs_from_splits`.

        :param to_split: The Document to split.
        :returns: The Documents created from `to_split`; an empty list if its content is None.
        """
        content = to_split.content
        # `run` already rejects None content before calling this method; the guard keeps linters happy.
        if content is None:
            return []

        if self.split_by == "function" and self.splitting_function is not None:
            function_docs: List[Document] = []
            for chunk in self.splitting_function(content):
                chunk_meta = deepcopy(to_split.meta)
                chunk_meta["source_id"] = to_split.id
                function_docs.append(Document(content=chunk, meta=chunk_meta))
            return function_docs

        delimiter = _SPLIT_BY_MAPPING[self.split_by]
        pieces = content.split(delimiter)
        # Give the delimiter back to every piece except the last one, so that joining
        # the units reproduces the original text exactly.
        units = [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]

        text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
            units, self.split_length, self.split_overlap, self.split_threshold
        )

        base_meta = deepcopy(to_split.meta)
        base_meta["source_id"] = to_split.id
        return self._create_docs_from_splits(
            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=base_meta
        )
156

157
    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
    ) -> Tuple[List[str], List[int], List[int]]:
        """
        Group the units into windows of `split_length` units that overlap by `split_overlap` units.

        A window holding fewer than `split_threshold` units does not become a split of its own: its text is
        appended to the previous split (when there is one), which avoids excessively small splits.

        :param elements: The units of the text, each still carrying its trailing delimiter.
        :param split_length: The number of units per window.
        :param split_overlap: The number of units shared by two consecutive windows.
        :param split_threshold: The minimum number of units a window needs to form its own split.
        :returns: A tuple with three lists of equal length: the text of each split, the page number on which
            each split starts and the character offset of each split in the original text.
        """

        text_splits: List[str] = []
        splits_pages: List[int] = []
        splits_start_idxs: List[int] = []
        # Number of units the window moves forward between two consecutive splits.
        step = split_length - split_overlap
        start_idx = 0
        page = 1

        for window in windowed(elements, n=split_length, step=step):
            # An incomplete window is padded with None; drop the padding.
            window_units = [unit for unit in window if unit is not None]
            window_text = "".join(window_units)

            if text_splits and len(window_units) < split_threshold:
                # Too short to stand alone: merge it into the previous split.
                text_splits[-1] += window_text
            elif window_text:
                # Windows without any text are skipped.
                text_splits.append(window_text)
                splits_pages.append(page)
                splits_start_idxs.append(start_idx)

            # Only the units the window leaves behind advance the offset and the page counter;
            # the overlapping tail is accounted for by the next window.
            consumed = window_units[:step]
            start_idx += sum(len(unit) for unit in consumed)

            if self.split_by == "page":
                # Each unit is a page of its own.
                page += len(consumed)
            else:
                page += sum(unit.count("\f") for unit in consumed)

        return text_splits, splits_pages, splits_start_idxs
1✔
201

202
    def _create_docs_from_splits(
        self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict[str, Any]
    ) -> List[Document]:
        """
        Build one Document per split and enrich its metadata.

        Every Document receives `page_number`, `split_id` and `split_idx_start`. When `split_overlap` is
        greater than 0, each Document also gets a `_split_overlap` list describing the text it shares with
        its neighbours.

        :param text_splits: The text of each split.
        :param splits_pages: The page number of each split.
        :param splits_start_idxs: The character offset of each split in the original text.
        :param meta: The metadata of the original document, used as the base for every split.
        :returns: The Documents created from the splits, in order.
        """
        documents: List[Document] = []
        chained_meta = meta

        for split_id, (text, start_idx) in enumerate(zip(text_splits, splits_start_idxs)):
            # NOTE(review): every copy is taken from the metadata of the previous split, not from the
            # pristine `meta`. All keys added below are overwritten again, but the dict handed to
            # `Document(...)` therefore differs between splits - presumably this feeds into the generated
            # document id, so confirm before simplifying this to a copy of `meta`.
            chained_meta = deepcopy(chained_meta)
            doc = Document(content=text, meta=chained_meta)
            doc.meta["page_number"] = splits_pages[split_id]
            doc.meta["split_id"] = split_id
            doc.meta["split_idx_start"] = start_idx
            documents.append(doc)

            if self.split_overlap > 0:
                doc.meta["_split_overlap"] = []
                # The first split has no predecessor to overlap with.
                if split_id > 0:
                    self._add_split_overlap_information(
                        doc, start_idx, documents[split_id - 1], splits_start_idxs[split_id - 1]
                    )

        return documents
1✔
232

233
    @staticmethod
    def _add_split_overlap_information(
        current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
    ):
        """
        Records in the meta of two consecutive splits which part of their text they share.

        Each entry appended to `_split_overlap` names the other Document (`doc_id`) and the character
        range of the shared text inside that other Document's content (`range`). Nothing is recorded if
        the two splits do not overlap or if the current split does not start with the expected text.

        :param current_doc: The newer of the two splits.
        :param current_doc_start_idx: The offset of the current split in the original text.
        :param previous_doc: The split created right before the current one.
        :param previous_doc_start_idx: The offset of the previous split in the original text.
        """
        previous_text = previous_doc.content
        # Bounds of the shared text, expressed as indices into the previous split.
        overlap_start = current_doc_start_idx - previous_doc_start_idx
        overlap_end = len(previous_text)  # type: ignore

        if overlap_start >= overlap_end:
            return

        shared_text = previous_text[overlap_start:overlap_end]  # type: ignore
        if not current_doc.content.startswith(shared_text):  # type: ignore
            return

        # The current split points at the tail of the previous split...
        current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": (overlap_start, overlap_end)})
        # ...and the previous split points at the head of the current one.
        previous_doc.meta["_split_overlap"].append(
            {"doc_id": current_doc.id, "range": (0, overlap_end - overlap_start)}
        )
1✔
257

258
    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns: The dictionary built by `default_to_dict` from the splitting parameters. A
            `splitting_function` entry, produced by `serialize_callable`, is added to `init_parameters`
            only when a splitting function is set.
        """
        init_parameters = {
            "split_by": self.split_by,
            "split_length": self.split_length,
            "split_overlap": self.split_overlap,
            "split_threshold": self.split_threshold,
        }
        serialized = default_to_dict(self, **init_parameters)

        splitting_function = self.splitting_function
        if splitting_function:
            # Callables are stored in their serialized form rather than as objects.
            serialized["init_parameters"]["splitting_function"] = serialize_callable(splitting_function)
        return serialized
1✔
272

273
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentSplitter":
        """
        Deserializes the component from a dictionary.

        :param data: The dictionary to deserialize from. A serialized `splitting_function` found in its
            `init_parameters` is replaced in place by the result of `deserialize_callable`.
        :returns: The deserialized component.
        """
        parameters = data.get("init_parameters", {})

        serialized_function = parameters.get("splitting_function")
        if serialized_function:
            # Turn the serialized form back into a callable before the component is built.
            parameters["splitting_function"] = deserialize_callable(serialized_function)

        return default_from_dict(cls, data)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc