10527400361

Committed 23 Aug 2024 02:24PM UTC coverage: 90.21%. Remained the same

Build # 10527400361

Build Type

push

github

Committed by

web-flow

Commit Message

docs: updating DocumentSplitter docstring, adding supported DocumentStores (#8270)

* initial import

* adding Chroma with limited support

* updating

* Update document_splitter.py

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update document_splitter.py

* linting

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

Run Details

6975 of 7732 relevant lines covered (90.21%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.96

haystack/components/preprocessors/document_splitter.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from typing import Dict, List, Literal, Tuple

from more_itertools import windowed

from haystack import Document, component


@component
class DocumentSplitter:
    """
    Splits long documents into smaller chunks.

    This is a common preprocessing step during indexing.
    It helps Embedders create meaningful semantic representations
    and prevents exceeding language model context limits.

    The DocumentSplitter is compatible with the following DocumentStores:
    - [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
    - [Chroma](https://docs.haystack.deepset.ai/docs/chromadocumentstore) limited support, overlapping information is
      not stored
    - [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
    - [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
    - [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
    - [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store) limited support, overlapping
      information is not stored
    - [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
    - [Weaviate](https://docs.haystack.deepset.ai/docs/weaviatedocumentstore)

    ### Usage example

    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
    result = splitter.run(documents=[doc])
    ```
    """

    def __init__(
        self,
        split_by: Literal["word", "sentence", "page", "passage"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
        split_threshold: int = 0,
    ):
        """
        Initialize DocumentSplitter.

        :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
            `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
            or `passage` for splitting by double line breaks ("\\n\\n").
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of overlapping units for each split. Must be smaller than `split_length`.
        :param split_threshold: The minimum number of units per split. If a split has fewer units
            than the threshold, it's attached to the previous split.

        :raises ValueError: if `split_by` is not a supported unit, if `split_length` is not positive, if
            `split_overlap` is negative, or if `split_overlap` is not smaller than `split_length`.
        """

        self.split_by = split_by
        if split_by not in ["word", "sentence", "page", "passage"]:
            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        # The sliding window advances by `split_length - split_overlap` units. A non-positive step is
        # rejected here with a clear message instead of failing later inside `windowed` during `run`.
        if split_overlap >= split_length:
            raise ValueError("split_overlap must be less than split_length.")
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. Each document includes:
                - A metadata field `source_id` to track the original document.
                - A metadata field `page_number` to track the original page number.
                - A metadata field `split_id` with the position of the split within its source document.
                - A metadata field `split_idx_start` with the character offset of the split in the source text.
                - A metadata field `_split_overlap` (only when `split_overlap` is greater than 0) describing
                  the character ranges shared with neighbouring splits.
                - All other metadata copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs: List[Document] = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
                units, self.split_length, self.split_overlap, self.split_threshold
            )
            # Work on a copy so the metadata of the input document is never mutated.
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
            split_docs.extend(
                self._create_docs_from_splits(
                    text_splits=text_splits,
                    splits_pages=splits_pages,
                    splits_start_idxs=splits_start_idxs,
                    meta=metadata,
                )
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
        """
        Splits the text into units at the delimiter associated with `split_by`.

        The delimiter is kept at the end of every unit except the last one, so joining the returned units
        reproduces `text` exactly.

        :param text: The text to split.
        :param split_by: The unit to split by.
        :returns: The list of units, in their original order.
        :raises NotImplementedError: if `split_by` is not a supported unit.
        """
        delimiters = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " "}
        if split_by not in delimiters:
            raise NotImplementedError(
                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
            )
        # Still stored on the instance, as before, in case external code reads `split_at`.
        self.split_at = delimiters[split_by]
        units = text.split(self.split_at)
        # Add the delimiter back to all units except the last one
        return [unit + self.split_at for unit in units[:-1]] + units[-1:]

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
    ) -> Tuple[List[str], List[int], List[int]]:
        """
        Concatenates the elements into parts of split_length units.

        Keeps track of the original page number that each element belongs. If the length of the current units is less
        than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current
        units with the last split, preventing the creation of excessively small splits.

        :param elements: The units to concatenate, as returned by `_split_into_units`.
        :param split_length: The number of units per window.
        :param split_overlap: The number of units shared by two consecutive windows.
        :param split_threshold: The minimum number of units a window needs to become a split of its own.
        :returns: A tuple of three parallel lists: the split texts, the page number each split starts on
            and the character offset at which each split starts.
        """

        text_splits: List[str] = []
        splits_pages: List[int] = []
        splits_start_idxs: List[int] = []
        cur_start_idx = 0
        cur_page = 1
        step = split_length - split_overlap
        segments = windowed(elements, n=split_length, step=step)

        for seg in segments:
            # `windowed` pads the last window with None; drop the padding.
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)

            # check if length of current units is below split_threshold
            if len(current_units) < split_threshold and text_splits:
                # Concatenate the last split with the current one. The first `split_overlap` units of this
                # window are already the tail of the previous split, so only the new units are appended;
                # appending the whole window would duplicate the overlapping text.
                text_splits[-1] += "".join(current_units[split_overlap:])

            elif txt:
                text_splits.append(txt)
                splits_pages.append(cur_page)
                splits_start_idxs.append(cur_start_idx)

            # Only the units the window advances past move the start offset and the page counter;
            # the remaining units are seen again as the overlap of the next window.
            processed_units = current_units[:step]
            cur_start_idx += sum(len(unit) for unit in processed_units)

            if self.split_by == "page":
                # when splitting by page, every processed unit is one page
                num_page_breaks = len(processed_units)
            else:
                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)

            cur_page += num_page_breaks

        return text_splits, splits_pages, splits_start_idxs

    def _create_docs_from_splits(
        self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict
    ) -> List[Document]:
        """
        Creates Document objects from splits enriching them with page number and the metadata of the original document.

        :param text_splits: The text of each split.
        :param splits_pages: The page number of each split.
        :param splits_start_idxs: The character offset of each split in the original text.
        :param meta: The metadata to copy into every created Document.
        :returns: One Document per split, in the same order as `text_splits`.
        """
        documents: List[Document] = []

        for i, (txt, page, start_idx) in enumerate(zip(text_splits, splits_pages, splits_start_idxs)):
            # Every split gets its own copy of the source metadata, so per-split fields are never shared.
            doc = Document(content=txt, meta=deepcopy(meta))
            doc.meta["page_number"] = page
            doc.meta["split_id"] = i
            doc.meta["split_idx_start"] = start_idx
            documents.append(doc)

            if self.split_overlap <= 0:
                continue

            doc.meta["_split_overlap"] = []

            if i == 0:
                # the first split has no predecessor to overlap with
                continue

            self._add_split_overlap_information(doc, start_idx, documents[i - 1], splits_start_idxs[i - 1])

        return documents

    @staticmethod
    def _add_split_overlap_information(
        current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
    ):
        """
        Adds split overlap information to the current and previous Document's meta.

        Both Documents must already have a `_split_overlap` list in their meta. Nothing is recorded when the
        two Documents do not share text.

        :param current_doc: The Document that is being split.
        :param current_doc_start_idx: The starting index of the current Document.
        :param previous_doc: The Document that was split before the current Document.
        :param previous_doc_start_idx: The starting index of the previous Document.
        """
        # Range, within the previous Document's content, where the current Document starts.
        overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content))  # type: ignore

        if overlapping_range[0] < overlapping_range[1]:
            overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]]  # type: ignore

            if current_doc.content.startswith(overlapping_str):  # type: ignore
                # add split overlap information to this Document regarding the previous Document
                current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})

                # add split overlap information to previous Document regarding this Document
                overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
                previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	from copy import deepcopy	1✔
6	from typing import Dict, List, Literal, Tuple	1✔
7
8	from more_itertools import windowed	1✔
9
10	from haystack import Document, component	1✔
11
12
13	@component	1✔
14	class DocumentSplitter:	1✔
15	"""
16	Splits long documents into smaller chunks.
17
18	This is a common preprocessing step during indexing.
19	It helps Embedders create meaningful semantic representations
20	and prevents exceeding language model context limits.
21
22	The DocumentSplitter is compatible with the following DocumentStores:
23	- (Astra)[https://docs.haystack.deepset.ai/docs/astradocumentstore]
24	- (Chroma)[https://docs.haystack.deepset.ai/docs/chromadocumentstore] limited support, overlapping information is
25	not stored
26	- (Elasticsearch)[https://docs.haystack.deepset.ai/docs/elasticsearch-document-store]
27	- (OpenSearch)[https://docs.haystack.deepset.ai/docs/opensearch-document-store]
28	- (Pgvector)[https://docs.haystack.deepset.ai/docs/pgvectordocumentstore]
29	- (Pinecone)[https://docs.haystack.deepset.ai/docs/pinecone-document-store] limited support, overlapping
30	information is not stored
31	- (Qdrant)[https://docs.haystack.deepset.ai/docs/qdrant-document-store]
32	- (Weaviate)[https://docs.haystack.deepset.ai/docs/weaviatedocumentstore]
33
34	### Usage example
35
36	```python
37	from haystack import Document
38	from haystack.components.preprocessors import DocumentSplitter
39
40	doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")
41
42	splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
43	result = splitter.run(documents=[doc])
44	```
45	"""
46
47	def __init__(	1✔
48	self,
49	split_by: Literal["word", "sentence", "page", "passage"] = "word",
50	split_length: int = 200,
51	split_overlap: int = 0,
52	split_threshold: int = 0,
53	):
54	"""
55	Initialize DocumentSplitter.
56
57	:param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
58	`sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
59	or `passage` for splitting by double line breaks ("\\n\\n").
60	:param split_length: The maximum number of units in each split.
61	:param split_overlap: The number of overlapping units for each split.
62	:param split_threshold: The minimum number of units per split. If a split has fewer units
63	than the threshold, it's attached to the previous split.
64	"""
65
66	self.split_by = split_by	1✔
67	if split_by not in ["word", "sentence", "page", "passage"]:	1✔
68	raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")	1✔
69	if split_length <= 0:	1✔
70	raise ValueError("split_length must be greater than 0.")	1✔
71	self.split_length = split_length	1✔
72	if split_overlap < 0:	1✔
73	raise ValueError("split_overlap must be greater than or equal to 0.")	1✔
74	self.split_overlap = split_overlap	1✔
75	self.split_threshold = split_threshold	1✔
76
77	@component.output_types(documents=List[Document])	1✔
78	def run(self, documents: List[Document]):	1✔
79	"""
80	Split documents into smaller parts.
81
82	Splits documents by the unit expressed in `split_by`, with a length of `split_length`
83	and an overlap of `split_overlap`.
84
85	:param documents: The documents to split.
86
87	:returns: A dictionary with the following key:
88	- `documents`: List of documents with the split texts. Each document includes:
89	- A metadata field `source_id` to track the original document.
90	- A metadata field `page_number` to track the original page number.
91	- All other metadata copied from the original document.
92
93	:raises TypeError: if the input is not a list of Documents.
94	:raises ValueError: if the content of a document is None.
95	"""
96
97	if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):	1✔
98	raise TypeError("DocumentSplitter expects a List of Documents as input.")	1✔
99
100	split_docs = []	1✔
101	for doc in documents:	1✔
102	if doc.content is None:	1✔
103	raise ValueError(	1✔
104	f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
105	)
106	units = self._split_into_units(doc.content, self.split_by)	1✔
107	text_splits, splits_pages, splits_start_idxs = self._concatenate_units(	1✔
108	units, self.split_length, self.split_overlap, self.split_threshold
109	)
110	metadata = deepcopy(doc.meta)	1✔
111	metadata["source_id"] = doc.id	1✔
112	split_docs += self._create_docs_from_splits(	1✔
113	text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
114	)
115	return {"documents": split_docs}	1✔
116
117	def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:	1✔
118	if split_by == "page":	1✔
119	self.split_at = "\f"	1✔
120	elif split_by == "passage":	1✔
121	self.split_at = "\n\n"	1✔
122	elif split_by == "sentence":	1✔
123	self.split_at = "."	1✔
124	elif split_by == "word":	1✔
125	self.split_at = " "	1✔
126	else:
127	raise NotImplementedError(	×
128	"DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
129	)
130	units = text.split(self.split_at)	1✔
131	# Add the delimiter back to all units except the last one
132	for i in range(len(units) - 1):	1✔
133	units[i] += self.split_at	1✔
134	return units	1✔
135
136	def _concatenate_units(	1✔
137	self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
138	) -> Tuple[List[str], List[int], List[int]]:
139	"""
140	Concatenates the elements into parts of split_length units.
141
142	Keeps track of the original page number that each element belongs. If the length of the current units is less
143	than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current
144	units with the last split, preventing the creation of excessively small splits.
145	"""
146
147	text_splits: List[str] = []	1✔
148	splits_pages = []	1✔
149	splits_start_idxs = []	1✔
150	cur_start_idx = 0	1✔
151	cur_page = 1	1✔
152	segments = windowed(elements, n=split_length, step=split_length - split_overlap)	1✔
153
154	for seg in segments:	1✔
155	current_units = [unit for unit in seg if unit is not None]	1✔
156	txt = "".join(current_units)	1✔
157
158	# check if length of current units is below split_threshold
159	if len(current_units) < split_threshold and len(text_splits) > 0:	1✔
160	# concatenate the last split with the current one
161	text_splits[-1] += txt	1✔
162
163	elif len(txt) > 0:	1✔
164	text_splits.append(txt)	1✔
165	splits_pages.append(cur_page)	1✔
166	splits_start_idxs.append(cur_start_idx)	1✔
167
168	processed_units = current_units[: split_length - split_overlap]	1✔
169	cur_start_idx += len("".join(processed_units))	1✔
170
171	if self.split_by == "page":	1✔
172	num_page_breaks = len(processed_units)	1✔
173	else:
174	num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)	1✔
175
176	cur_page += num_page_breaks	1✔
177
178	return text_splits, splits_pages, splits_start_idxs	1✔
179
180	def _create_docs_from_splits(	1✔
181	self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict
182	) -> List[Document]:
183	"""
184	Creates Document objects from splits enriching them with page number and the metadata of the original document.
185	"""
186	documents: List[Document] = []	1✔
187
188	for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):	1✔
189	meta = deepcopy(meta)	1✔
190	doc = Document(content=txt, meta=meta)	1✔
191	doc.meta["page_number"] = splits_pages[i]	1✔
192	doc.meta["split_id"] = i	1✔
193	doc.meta["split_idx_start"] = split_idx	1✔
194	documents.append(doc)	1✔
195
196	if self.split_overlap <= 0:	1✔
197	continue	1✔
198
199	doc.meta["_split_overlap"] = []	1✔
200
201	if i == 0:	1✔
202	continue	1✔
203
204	doc_start_idx = splits_start_idxs[i]	1✔
205	previous_doc = documents[i - 1]	1✔
206	previous_doc_start_idx = splits_start_idxs[i - 1]	1✔
207	self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)	1✔
208
209	return documents	1✔
210
211	@staticmethod	1✔
212	def _add_split_overlap_information(	1✔
213	current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
214	):
215	"""
216	Adds split overlap information to the current and previous Document's meta.
217
218	:param current_doc: The Document that is being split.
219	:param current_doc_start_idx: The starting index of the current Document.
220	:param previous_doc: The Document that was split before the current Document.
221	:param previous_doc_start_idx: The starting index of the previous Document.
222	"""
223	overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content)) # type: ignore	1✔
224
225	if overlapping_range[0] < overlapping_range[1]:	1✔
226	overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore	1✔
227
228	if current_doc.content.startswith(overlapping_str): # type: ignore	1✔
229	# add split overlap information to this Document regarding the previous Document
230	current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})	1✔
231
232	# add split overlap information to previous Document regarding this Document
233	overlapping_range = (0, overlapping_range[1] - overlapping_range[0])	1✔
234	previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})	1✔

deepset-ai / haystack / 10527400361

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous