11890937783

Committed 18 Nov 2024 10:54AM UTC coverage: 90.244% (-0.002%) from 90.246%

Build # 11890937783

Build Type

push

github

Committed by

web-flow

Commit Message

Fix `DocumentSplitter` not splitting by function (#8549)

* Fix DocumentSplitter not splitting by function

* Make the split_by mapping a constant

Run Details

7844 of 8692 relevant lines covered (90.24%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.32

haystack/components/preprocessors/document_splitter.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

from more_itertools import windowed

from haystack import Document, component, logging
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable

logger = logging.getLogger(__name__)

# Lookup table from each delimiter-based 'split_by' mode to the string the
# Documents are split on. 'function' has no entry on purpose: that mode hands
# the text to a user-supplied callable instead of splitting on a delimiter.
_SPLIT_BY_MAPPING = dict(
    page="\f",
    passage="\n\n",
    sentence=".",
    word=" ",
    line="\n",
)


@component
class DocumentSplitter:
    """
    Splits long documents into smaller chunks.

    This is a common preprocessing step during indexing.
    It helps Embedders create meaningful semantic representations
    and prevents exceeding language model context limits.

    The DocumentSplitter is compatible with the following DocumentStores:
    - [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
    - [Chroma](https://docs.haystack.deepset.ai/docs/chromadocumentstore) limited support, overlapping information is
      not stored
    - [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
    - [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
    - [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
    - [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store) limited support, overlapping
      information is not stored
    - [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
    - [Weaviate](https://docs.haystack.deepset.ai/docs/weaviatedocumentstore)

    ### Usage example

    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
    result = splitter.run(documents=[doc])
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        split_by: Literal["function", "page", "passage", "sentence", "word", "line"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
    ):
        """
        Initialize DocumentSplitter.

        :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
            `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
            `passage` for splitting by double line breaks ("\\n\\n"), `line` for splitting each line ("\\n")
            or `function` for delegating the splitting to `splitting_function`.
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of overlapping units for each split. For the delimiter-based modes it
            should be smaller than `split_length`: the window step used when grouping units is
            `split_length - split_overlap`, and it has to be positive.
        :param split_threshold: The minimum number of units per split. If a split has fewer units
            than the threshold, it's attached to the previous split.
        :param splitting_function: Necessary when `split_by` is set to "function".
            This is a function which must accept a single `str` as input and return a `list` of `str` as output,
            representing the chunks after splitting.

        :raises ValueError: if `split_by` is not one of the supported values, if `split_by` is "function" but no
            `splitting_function` is given, if `split_length` is not positive or if `split_overlap` is negative.
        """

        # Validate everything first (in the same order as before) and only then store the configuration.
        if split_by not in ("function", "page", "passage", "sentence", "word", "line"):
            raise ValueError("split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'.")
        if split_by == "function" and splitting_function is None:
            raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`. Documents whose content is an empty string are skipped
        with a warning.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. Each document includes:
                - A metadata field `source_id` to track the original document.
                - Metadata fields `page_number`, `split_id` and `split_idx_start` describing where the split
                  comes from (not set when `split_by` is "function").
                - A metadata field `_split_overlap` when `split_overlap` is greater than 0
                  (not set when `split_by` is "function").
                - All other metadata copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        # Check every element, not only the first one, so a mixed list fails with the documented
        # TypeError instead of an unrelated error further down.
        if not isinstance(documents, list) or not all(isinstance(doc, Document) for doc in documents):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs: List[Document] = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                )
            if doc.content == "":
                logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
                continue
            split_docs.extend(self._split(doc))
        return {"documents": split_docs}

    def _split(self, to_split: Document) -> List[Document]:
        """
        Splits a single Document according to the configured `split_by` mode.

        :param to_split: The Document to split.
        :returns: The Documents created from the splits; an empty list if the content is None.
        """
        # `run` already rejects None content before calling _split;
        # the check is repeated here to keep linters happy.
        content = to_split.content
        if content is None:
            return []

        if self.split_by == "function" and self.splitting_function is not None:
            # The callable decides the chunks; every chunk gets its own copy of the source metadata.
            docs: List[Document] = []
            for chunk in self.splitting_function(content):
                chunk_meta = deepcopy(to_split.meta)
                chunk_meta["source_id"] = to_split.id
                docs.append(Document(content=chunk, meta=chunk_meta))
            return docs

        split_at = _SPLIT_BY_MAPPING[self.split_by]
        pieces = content.split(split_at)
        # Add the delimiter back to all units except the last one, so that joining
        # the units reproduces the original text exactly.
        units = [piece + split_at for piece in pieces[:-1]] + pieces[-1:]

        text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
            units, self.split_length, self.split_overlap, self.split_threshold
        )
        metadata = deepcopy(to_split.meta)
        metadata["source_id"] = to_split.id
        return self._create_docs_from_splits(
            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
        )

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
    ) -> Tuple[List[str], List[int], List[int]]:
        """
        Concatenates the elements into parts of split_length units.

        Keeps track of the original page number that each element belongs. If the length of the current units is less
        than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current
        units with the last split, preventing the creation of excessively small splits.

        :param elements: The units to group, each one still carrying its trailing delimiter.
        :param split_length: Number of units per window.
        :param split_overlap: Number of units shared by two consecutive windows.
        :param split_threshold: Minimum number of units a window needs to become a split of its own.
        :returns: Three lists: the split texts, the page number each split starts on (1-based)
            and the character offset at which each split starts.
        """

        text_splits: List[str] = []
        splits_pages: List[int] = []
        splits_start_idxs: List[int] = []
        cur_start_idx = 0
        cur_page = 1
        # Number of units the window advances by on every iteration.
        step = split_length - split_overlap

        for seg in windowed(elements, n=split_length, step=step):
            # The last window may be padded with None; drop the padding.
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)

            if len(current_units) < split_threshold and text_splits:
                # Too few units for a split of its own: attach the text to the previous split.
                text_splits[-1] += txt
            elif txt:
                # Windows without any text do not produce a split.
                text_splits.append(txt)
                splits_pages.append(cur_page)
                splits_start_idxs.append(cur_start_idx)

            # Only the units the window moves past count towards the offset and page of the next split;
            # the remaining ones are the overlap and are seen again by the next window.
            processed_units = current_units[:step]
            cur_start_idx += sum(len(unit) for unit in processed_units)

            if self.split_by == "page":
                # Every unit is a page when splitting by page.
                cur_page += len(processed_units)
            else:
                cur_page += sum(unit.count("\f") for unit in processed_units)

        return text_splits, splits_pages, splits_start_idxs

    def _create_docs_from_splits(
        self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict[str, Any]
    ) -> List[Document]:
        """
        Creates Document objects from splits enriching them with page number and the metadata of the original document.

        :param text_splits: The text of each split.
        :param splits_pages: The page number of each split.
        :param splits_start_idxs: The character offset of each split in the original text.
        :param meta: The metadata of the original document; it is copied for every split and never modified.
        :returns: One Document per split with `page_number`, `split_id` and `split_idx_start` in its metadata,
            plus `_split_overlap` when `split_overlap` is greater than 0.
        """
        documents: List[Document] = []

        for i, (txt, page, start_idx) in enumerate(zip(text_splits, splits_pages, splits_start_idxs)):
            # Always copy the source metadata (not the previous split's, which already carries that
            # split's own fields) and fill it in before the Document is built.
            # NOTE(review): presumably Document derives its id from the metadata it is constructed with,
            # in which case ids now reflect each split's own metadata - confirm against Document.
            split_meta = deepcopy(meta)
            split_meta["page_number"] = page
            split_meta["split_id"] = i
            split_meta["split_idx_start"] = start_idx
            doc = Document(content=txt, meta=split_meta)
            documents.append(doc)

            if self.split_overlap <= 0:
                continue

            doc.meta["_split_overlap"] = []

            # The first split has no predecessor to overlap with.
            if i == 0:
                continue

            self._add_split_overlap_information(doc, start_idx, documents[i - 1], splits_start_idxs[i - 1])

        return documents

    @staticmethod
    def _add_split_overlap_information(
        current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
    ):
        """
        Adds split overlap information to the current and previous Document's meta.

        Nothing is recorded when the two Documents do not overlap or when the current Document
        does not start with the overlapping text.

        :param current_doc: The Document that is being split.
        :param current_doc_start_idx: The starting index of the current Document.
        :param previous_doc: The Document that was split before the current Document.
        :param previous_doc_start_idx: The starting index of the previous Document.
        """
        # Range of the overlap, expressed in the previous Document's own character positions.
        overlap_start = current_doc_start_idx - previous_doc_start_idx
        overlap_end = len(previous_doc.content)  # type: ignore

        if overlap_start >= overlap_end:
            return

        overlapping_str = previous_doc.content[overlap_start:overlap_end]  # type: ignore
        if not current_doc.content.startswith(overlapping_str):  # type: ignore
            return

        # add split overlap information to this Document regarding the previous Document
        current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": (overlap_start, overlap_end)})

        # add split overlap information to previous Document regarding this Document;
        # here the range is expressed in the current Document's character positions
        previous_doc.meta["_split_overlap"].append(
            {"doc_id": current_doc.id, "range": (0, overlap_end - overlap_start)}
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns: The serialized component. `splitting_function` is included, in serialized form,
            only when one was provided.
        """
        serialized = default_to_dict(
            self,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
        )
        # Explicit None check, matching the check done in __init__.
        if self.splitting_function is not None:
            serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentSplitter":
        """
        Deserializes the component from a dictionary.

        :param data: The dictionary to deserialize from. It is not modified.
        :returns: The deserialized component.
        """
        init_params = dict(data.get("init_parameters", {}))

        splitting_function = init_params.get("splitting_function", None)
        if splitting_function:
            # Put the callable into a copy so the caller's dictionary keeps the serialized
            # form and can be deserialized again.
            init_params["splitting_function"] = deserialize_callable(splitting_function)
            data = {**data, "init_parameters": init_params}

        return default_from_dict(cls, data)

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	from copy import deepcopy	1✔
6	from typing import Any, Callable, Dict, List, Literal, Optional, Tuple	1✔
7
8	from more_itertools import windowed	1✔
9
10	from haystack import Document, component, logging	1✔
11	from haystack.core.serialization import default_from_dict, default_to_dict	1✔
12	from haystack.utils import deserialize_callable, serialize_callable	1✔
13
14	logger = logging.getLogger(__name__)	1✔
15
16	# Maps the 'split_by' argument to the actual char used to split the Documents.
17	# 'function' is not in the mapping cause it doesn't split on chars.
18	_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " ", "line": "\n"}	1✔
19
20
21	@component	1✔
22	class DocumentSplitter:	1✔
23	"""
24	Splits long documents into smaller chunks.
25
26	This is a common preprocessing step during indexing.
27	It helps Embedders create meaningful semantic representations
28	and prevents exceeding language model context limits.
29
30	The DocumentSplitter is compatible with the following DocumentStores:
31	- [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
32	- [Chroma](https://docs.haystack.deepset.ai/docs/chromadocumentstore) limited support, overlapping information is
33	not stored
34	- [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
35	- [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
36	- [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
37	- [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store) limited support, overlapping
38	information is not stored
39	- [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
40	- [Weaviate](https://docs.haystack.deepset.ai/docs/weaviatedocumentstore)
41
42	### Usage example
43
44	```python
45	from haystack import Document
46	from haystack.components.preprocessors import DocumentSplitter
47
48	doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")
49
50	splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
51	result = splitter.run(documents=[doc])
52	```
53	"""
54
55	def __init__( # pylint: disable=too-many-positional-arguments	1✔
56	self,
57	split_by: Literal["function", "page", "passage", "sentence", "word", "line"] = "word",
58	split_length: int = 200,
59	split_overlap: int = 0,
60	split_threshold: int = 0,
61	splitting_function: Optional[Callable[[str], List[str]]] = None,
62	):
63	"""
64	Initialize DocumentSplitter.
65
66	:param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
67	`sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
68	`passage` for splitting by double line breaks ("\\n\\n") or `line` for splitting each line ("\\n").
69	:param split_length: The maximum number of units in each split.
70	:param split_overlap: The number of overlapping units for each split.
71	:param split_threshold: The minimum number of units per split. If a split has fewer units
72	than the threshold, it's attached to the previous split.
73	:param splitting_function: Necessary when `split_by` is set to "function".
74	This is a function which must accept a single `str` as input and return a `list` of `str` as output,
75	representing the chunks after splitting.
76	"""
77
78	self.split_by = split_by	1✔
79	if split_by not in ["function", "page", "passage", "sentence", "word", "line"]:	1✔
80	raise ValueError("split_by must be one of 'function', 'word', 'sentence', 'page', 'passage' or 'line'.")	1✔
81	if split_by == "function" and splitting_function is None:	1✔
82	raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.")	×
83	if split_length <= 0:	1✔
84	raise ValueError("split_length must be greater than 0.")	1✔
85	self.split_length = split_length	1✔
86	if split_overlap < 0:	1✔
87	raise ValueError("split_overlap must be greater than or equal to 0.")	1✔
88	self.split_overlap = split_overlap	1✔
89	self.split_threshold = split_threshold	1✔
90	self.splitting_function = splitting_function	1✔
91
92	@component.output_types(documents=List[Document])	1✔
93	def run(self, documents: List[Document]):	1✔
94	"""
95	Split documents into smaller parts.
96
97	Splits documents by the unit expressed in `split_by`, with a length of `split_length`
98	and an overlap of `split_overlap`.
99
100	:param documents: The documents to split.
101
102	:returns: A dictionary with the following key:
103	- `documents`: List of documents with the split texts. Each document includes:
104	- A metadata field `source_id` to track the original document.
105	- A metadata field `page_number` to track the original page number.
106	- All other metadata copied from the original document.
107
108	:raises TypeError: if the input is not a list of Documents.
109	:raises ValueError: if the content of a document is None.
110	"""
111
112	if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):	1✔
113	raise TypeError("DocumentSplitter expects a List of Documents as input.")	1✔
114
115	split_docs: List[Document] = []	1✔
116	for doc in documents:	1✔
117	if doc.content is None:	1✔
118	raise ValueError(	1✔
119	f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
120	)
121	if doc.content == "":	1✔
122	logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)	1✔
123	continue	1✔
124	split_docs += self._split(doc)	1✔
125	return {"documents": split_docs}	1✔
126
127	def _split(self, to_split: Document) -> List[Document]:	1✔
128	# We already check this before calling _split but
129	# we need to make linters happy
130	if to_split.content is None:	1✔
131	return []	×
132
133	if self.split_by == "function" and self.splitting_function is not None:	1✔
134	splits = self.splitting_function(to_split.content)	1✔
135	docs: List[Document] = []	1✔
136	for s in splits:	1✔
137	meta = deepcopy(to_split.meta)	1✔
138	meta["source_id"] = to_split.id	1✔
139	docs.append(Document(content=s, meta=meta))	1✔
140	return docs	1✔
141
142	split_at = _SPLIT_BY_MAPPING[self.split_by]	1✔
143	units = to_split.content.split(split_at)	1✔
144	# Add the delimiter back to all units except the last one
145	for i in range(len(units) - 1):	1✔
146	units[i] += split_at	1✔
147
148	text_splits, splits_pages, splits_start_idxs = self._concatenate_units(	1✔
149	units, self.split_length, self.split_overlap, self.split_threshold
150	)
151	metadata = deepcopy(to_split.meta)	1✔
152	metadata["source_id"] = to_split.id	1✔
153	return self._create_docs_from_splits(	1✔
154	text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
155	)
156
157	def _concatenate_units(	1✔
158	self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
159	) -> Tuple[List[str], List[int], List[int]]:
160	"""
161	Concatenates the elements into parts of split_length units.
162
163	Keeps track of the original page number that each element belongs. If the length of the current units is less
164	than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current
165	units with the last split, preventing the creation of excessively small splits.
166	"""
167
168	text_splits: List[str] = []	1✔
169	splits_pages: List[int] = []	1✔
170	splits_start_idxs: List[int] = []	1✔
171	cur_start_idx = 0	1✔
172	cur_page = 1	1✔
173	segments = windowed(elements, n=split_length, step=split_length - split_overlap)	1✔
174
175	for seg in segments:	1✔
176	current_units = [unit for unit in seg if unit is not None]	1✔
177	txt = "".join(current_units)	1✔
178
179	# check if length of current units is below split_threshold
180	if len(current_units) < split_threshold and len(text_splits) > 0:	1✔
181	# concatenate the last split with the current one
182	text_splits[-1] += txt	1✔
183
184	# NOTE: This line skips documents that have content=""
185	elif len(txt) > 0:	1✔
186	text_splits.append(txt)	1✔
187	splits_pages.append(cur_page)	1✔
188	splits_start_idxs.append(cur_start_idx)	1✔
189
190	processed_units = current_units[: split_length - split_overlap]	1✔
191	cur_start_idx += len("".join(processed_units))	1✔
192
193	if self.split_by == "page":	1✔
194	num_page_breaks = len(processed_units)	1✔
195	else:
196	num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)	1✔
197
198	cur_page += num_page_breaks	1✔
199
200	return text_splits, splits_pages, splits_start_idxs	1✔
201
202	def _create_docs_from_splits(	1✔
203	self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict[str, Any]
204	) -> List[Document]:
205	"""
206	Creates Document objects from splits enriching them with page number and the metadata of the original document.
207	"""
208	documents: List[Document] = []	1✔
209
210	for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):	1✔
211	meta = deepcopy(meta)	1✔
212	doc = Document(content=txt, meta=meta)	1✔
213	doc.meta["page_number"] = splits_pages[i]	1✔
214	doc.meta["split_id"] = i	1✔
215	doc.meta["split_idx_start"] = split_idx	1✔
216	documents.append(doc)	1✔
217
218	if self.split_overlap <= 0:	1✔
219	continue	1✔
220
221	doc.meta["_split_overlap"] = []	1✔
222
223	if i == 0:	1✔
224	continue	1✔
225
226	doc_start_idx = splits_start_idxs[i]	1✔
227	previous_doc = documents[i - 1]	1✔
228	previous_doc_start_idx = splits_start_idxs[i - 1]	1✔
229	self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)	1✔
230
231	return documents	1✔
232
233	@staticmethod	1✔
234	def _add_split_overlap_information(	1✔
235	current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
236	):
237	"""
238	Adds split overlap information to the current and previous Document's meta.
239
240	:param current_doc: The Document that is being split.
241	:param current_doc_start_idx: The starting index of the current Document.
242	:param previous_doc: The Document that was split before the current Document.
243	:param previous_doc_start_idx: The starting index of the previous Document.
244	"""
245	overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content)) # type: ignore	1✔
246
247	if overlapping_range[0] < overlapping_range[1]:	1✔
248	overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore	1✔
249
250	if current_doc.content.startswith(overlapping_str): # type: ignore	1✔
251	# add split overlap information to this Document regarding the previous Document
252	current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})	1✔
253
254	# add split overlap information to previous Document regarding this Document
255	overlapping_range = (0, overlapping_range[1] - overlapping_range[0])	1✔
256	previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})	1✔
257
258	def to_dict(self) -> Dict[str, Any]:	1✔
259	"""
260	Serializes the component to a dictionary.
261	"""
262	serialized = default_to_dict(	1✔
263	self,
264	split_by=self.split_by,
265	split_length=self.split_length,
266	split_overlap=self.split_overlap,
267	split_threshold=self.split_threshold,
268	)
269	if self.splitting_function:	1✔
270	serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)	1✔
271	return serialized	1✔
272
273	@classmethod	1✔
274	def from_dict(cls, data: Dict[str, Any]) -> "DocumentSplitter":	1✔
275	"""
276	Deserializes the component from a dictionary.
277	"""
278	init_params = data.get("init_parameters", {})	1✔
279
280	splitting_function = init_params.get("splitting_function", None)	1✔
281	if splitting_function:	1✔
282	init_params["splitting_function"] = deserialize_callable(splitting_function)	1✔
283
284	return default_from_dict(cls, data)	1✔

deepset-ai / haystack / 11890937783

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous