haystack/components/preprocessors/document_splitter.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

from more_itertools import windowed

from haystack import Document, component, logging
from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable

logger = logging.getLogger(__name__)

_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "sentence": ".", "word": " ", "line": "\n"}


@component
class DocumentSplitter:
    """
    Splits long documents into smaller chunks.

    This is a common preprocessing step during indexing. It helps Embedders create meaningful semantic representations
    and prevents exceeding language model context limits.

    The DocumentSplitter is compatible with the following DocumentStores:
    - [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
    - [Chroma](https://docs.haystack.deepset.ai/docs/chromadocumentstore) limited support, overlapping information is
      not stored
    - [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
    - [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
    - [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
    - [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store) limited support, overlapping
      information is not stored
    - [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
    - [Weaviate](https://docs.haystack.deepset.ai/docs/weaviatedocumentstore)

    ### Usage example

    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
    result = splitter.run(documents=[doc])
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        split_by: Literal["function", "page", "passage", "sentence", "word", "line", "nltk_sentence"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Optional[Callable[[str], List[str]]] = None,
        respect_sentence_boundary: bool = False,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
    ):
        """
        Initialize DocumentSplitter.

        :param split_by: The unit for splitting your documents. Choose from:
            - `word` for splitting by spaces (" ")
            - `sentence` for splitting by periods (".")
            - `page` for splitting by form feed ("\\f")
            - `passage` for splitting by double line breaks ("\\n\\n")
            - `line` for splitting each line ("\\n")
            - `nltk_sentence` for splitting by the NLTK sentence tokenizer
            - `function` for splitting with the provided `splitting_function`

        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of overlapping units for each split.
        :param split_threshold: The minimum number of units per split. If a split has fewer units
            than the threshold, it's attached to the previous split.
        :param splitting_function: Necessary when `split_by` is set to "function".
            This is a function which must accept a single `str` as input and return a `list` of `str` as output,
            representing the chunks after splitting.
        :param respect_sentence_boundary: Choose whether to respect sentence boundaries when splitting by "word".
            If True, uses NLTK to detect sentence boundaries, ensuring splits occur only between sentences.
        :param language: Choose the language for the NLTK tokenizer. The default is English ("en").
        :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`.
        :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list
            of curated abbreviations, if available. This is currently supported for English ("en") and German ("de").
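
        Example (a minimal sketch of the `split_by="function"` mode; the semicolon-splitting lambda below is
        purely illustrative, not part of the library):

        ```python
        from haystack.components.preprocessors import DocumentSplitter

        splitter = DocumentSplitter(
            split_by="function",
            splitting_function=lambda text: [part for part in text.split(";") if part],
        )
        ```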
        """

        self._init_checks(
            split_by=split_by,
            split_length=split_length,
            split_overlap=split_overlap,
            splitting_function=splitting_function,
            respect_sentence_boundary=respect_sentence_boundary,
        )

        if respect_sentence_boundary and split_by != "word":
            # _init_checks has already logged a warning for this combination; disable the option here so
            # that _split_document does not take the sentence-boundary path without a sentence splitter.
            respect_sentence_boundary = False

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
        self.respect_sentence_boundary = respect_sentence_boundary
        self.language = language
        self.use_split_rules = use_split_rules
        self.extend_abbreviations = extend_abbreviations

        if split_by == "nltk_sentence" or (respect_sentence_boundary and split_by == "word"):
            nltk_imports.check()
            self.sentence_splitter = SentenceSplitter(
                language=language,
                use_split_rules=use_split_rules,
                extend_abbreviations=extend_abbreviations,
                keep_white_spaces=True,
            )

    @staticmethod
    def _init_checks(
        split_by: str,
        split_length: int,
        split_overlap: int,
        splitting_function: Optional[Callable],
        respect_sentence_boundary: bool,
    ) -> None:
        """
        Validates initialization parameters for DocumentSplitter.

        :param split_by: The unit for splitting documents.
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of overlapping units for each split.
        :param splitting_function: Custom function for splitting when split_by="function".
        :param respect_sentence_boundary: Whether to respect sentence boundaries when splitting.
        :raises ValueError: If any parameter is invalid.
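
        Example (illustrative): `DocumentSplitter(split_length=0)` raises
        `ValueError("split_length must be greater than 0.")`.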
        """
        valid_split_by = ["function", "page", "passage", "sentence", "word", "line", "nltk_sentence"]
        if split_by not in valid_split_by:
            raise ValueError(f"split_by must be one of {', '.join(valid_split_by)}.")

        if split_by == "function" and splitting_function is None:
            raise ValueError("When 'split_by' is set to 'function', a valid 'splitting_function' must be provided.")

        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")

        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")

        if respect_sentence_boundary and split_by != "word":
            logger.warning(
                "The 'respect_sentence_boundary' option is only supported for `split_by='word'`. "
                "The option `respect_sentence_boundary` will be set to `False`."
            )

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`.

        :param documents: The documents to split.
        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. Each document includes:
                - A metadata field `source_id` to track the original document.
                - A metadata field `page_number` to track the original page number.
                - All other metadata copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
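
        Usage example (a minimal sketch; the exact split boundaries depend on the configured parameters):

        ```python
        from haystack import Document
        from haystack.components.preprocessors import DocumentSplitter

        splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
        result = splitter.run(documents=[Document(content="one two three four five")])
        for doc in result["documents"]:
            print(doc.meta["split_id"], doc.meta["split_idx_start"], repr(doc.content))
        ```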
        """

        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs: List[Document] = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                )
            if doc.content == "":
                logger.warning("Document ID {doc_id} has empty content. Skipping this document.", doc_id=doc.id)
                continue

            split_docs += self._split_document(doc)
        return {"documents": split_docs}

    def _split_document(self, doc: Document) -> List[Document]:
        if self.split_by == "nltk_sentence" or self.respect_sentence_boundary:
            return self._split_by_nltk_sentence(doc)

        if self.split_by == "function" and self.splitting_function is not None:
            return self._split_by_function(doc)

        return self._split_by_character(doc)

    def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
        if doc.content is None:
            return []

        split_docs = []
        # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True,
        # so split_at is set to an empty string
        self.split_at = ""
        result = self.sentence_splitter.split_sentences(doc.content)
        units = [sentence["sentence"] for sentence in result]

        if self.respect_sentence_boundary:
            text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount(
                sentences=units, split_length=self.split_length, split_overlap=self.split_overlap
            )
        else:
            text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
                elements=units,
                split_length=self.split_length,
                split_overlap=self.split_overlap,
                split_threshold=self.split_threshold,
            )
        metadata = deepcopy(doc.meta)
        metadata["source_id"] = doc.id
        split_docs += self._create_docs_from_splits(
            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
        )

        return split_docs

    def _split_by_character(self, doc: Document) -> List[Document]:
        split_at = _SPLIT_BY_MAPPING[self.split_by]
        units = doc.content.split(split_at)
        # Add the delimiter back to all units except the last one
        for i in range(len(units) - 1):
            units[i] += split_at
        text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
            units, self.split_length, self.split_overlap, self.split_threshold
        )
        metadata = deepcopy(doc.meta)
        metadata["source_id"] = doc.id
        return self._create_docs_from_splits(
            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
        )

    def _split_by_function(self, doc: Document) -> List[Document]:
        # the check for None is done already in the run method
        splits = self.splitting_function(doc.content)  # type: ignore
        docs: List[Document] = []
        for s in splits:
            meta = deepcopy(doc.meta)
            meta["source_id"] = doc.id
            docs.append(Document(content=s, meta=meta))
        return docs

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
    ) -> Tuple[List[str], List[int], List[int]]:
        """
        Concatenates the elements into parts of split_length units.

        Keeps track of the original page number that each element belongs to. If the length of the current
        units is less than the pre-defined `split_threshold`, it does not create a new split. Instead, it
        concatenates the current units with the last split, preventing the creation of excessively small splits.
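
        Example (a worked sketch for `split_by="word"`): with `elements=["a ", "b ", "c ", "d ", "e"]`,
        `split_length=2`, `split_overlap=0`, and `split_threshold=2`, the windows are `["a ", "b "]`,
        `["c ", "d "]`, and `["e"]`; the final one-unit window falls below the threshold, so it is appended
        to the previous split, yielding `["a b ", "c d e"]`.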
        """

        text_splits: List[str] = []
        splits_pages: List[int] = []
        splits_start_idxs: List[int] = []
        cur_start_idx = 0
        cur_page = 1
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)

        for seg in segments:
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)

            # check if length of current units is below split_threshold
            if len(current_units) < split_threshold and len(text_splits) > 0:
                # concatenate the last split with the current one
                text_splits[-1] += txt

            # NOTE: This line skips documents that have content=""
            elif len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
                splits_start_idxs.append(cur_start_idx)

            processed_units = current_units[: split_length - split_overlap]
            cur_start_idx += len("".join(processed_units))

            if self.split_by == "page":
                num_page_breaks = len(processed_units)
            else:
                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)

            cur_page += num_page_breaks

        return text_splits, splits_pages, splits_start_idxs

    def _create_docs_from_splits(
        self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict[str, Any]
    ) -> List[Document]:
        """
        Creates Document objects from splits, enriching them with the page number and the metadata of the
        original document.
        """
        documents: List[Document] = []

        for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
            meta = deepcopy(meta)
            doc = Document(content=txt, meta=meta)
            doc.meta["page_number"] = splits_pages[i]
            doc.meta["split_id"] = i
            doc.meta["split_idx_start"] = split_idx
            documents.append(doc)

            if self.split_overlap <= 0:
                continue

            doc.meta["_split_overlap"] = []

            if i == 0:
                continue

            doc_start_idx = splits_start_idxs[i]
            previous_doc = documents[i - 1]
            previous_doc_start_idx = splits_start_idxs[i - 1]
            self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)

        return documents

    @staticmethod
    def _add_split_overlap_information(
        current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
    ):
        """
        Adds split overlap information to the current and previous Document's meta.

        :param current_doc: The current split Document.
        :param current_doc_start_idx: The starting index of the current Document.
        :param previous_doc: The Document that was split before the current Document.
        :param previous_doc_start_idx: The starting index of the previous Document.
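
        Example (a worked sketch): if the previous Document starts at index 0 with content `"one two three "`
        and the current Document starts at index 8 with content `"three four five"`, the overlapping range in
        the previous Document is `(8, 14)`, so both Documents receive a matching `_split_overlap` meta entry.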
        """
        overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content))  # type: ignore

        if overlapping_range[0] < overlapping_range[1]:
            overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]]  # type: ignore

            if current_doc.content.startswith(overlapping_str):  # type: ignore
                # add split overlap information to this Document regarding the previous Document
                current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})

                # add split overlap information to previous Document regarding this Document
                overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
                previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.
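
        Example (a minimal sketch of a serialization round trip):

        ```python
        from haystack.components.preprocessors import DocumentSplitter

        splitter = DocumentSplitter(split_by="sentence", split_length=5)
        restored = DocumentSplitter.from_dict(splitter.to_dict())
        ```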
        """
        # serialize all init parameters so that from_dict restores the same configuration
        serialized = default_to_dict(
            self,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )
        if self.splitting_function:
            serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentSplitter":
        """
        Deserializes the component from a dictionary.
        """
        init_params = data.get("init_parameters", {})

        splitting_function = init_params.get("splitting_function", None)
        if splitting_function:
            init_params["splitting_function"] = deserialize_callable(splitting_function)

        return default_from_dict(cls, data)

    def _concatenate_sentences_based_on_word_amount(
        self, sentences: List[str], split_length: int, split_overlap: int
    ) -> Tuple[List[str], List[int], List[int]]:
        """
        Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.

        This function is only used when splitting by `word` with `respect_sentence_boundary` set to `True`,
        i.e., together with the NLTK sentence tokenizer.

        :param sentences: The list of sentences to split.
        :param split_length: The maximum number of words in each split.
        :param split_overlap: The number of overlapping words in each split.
        :returns: A tuple containing the concatenated sentences, the start page numbers, and the start indices.
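
        Example (a worked sketch): with sentences `["One two three. ", "Four five. ", "Six seven eight nine. "]`,
        `split_length=5`, and `split_overlap=0`, the first two sentences form a single 5-word chunk, and the
        third sentence starts a new chunk because adding it would exceed `split_length`.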
        """
        # Chunk information
        chunk_word_count = 0
        chunk_starting_page_number = 1
        chunk_start_idx = 0
        current_chunk: List[str] = []
        # Output lists
        split_start_page_numbers = []
        list_of_splits: List[List[str]] = []
        split_start_indices = []

        for sentence_idx, sentence in enumerate(sentences):
            current_chunk.append(sentence)
            chunk_word_count += len(sentence.split())
            next_sentence_word_count = (
                len(sentences[sentence_idx + 1].split()) if sentence_idx < len(sentences) - 1 else 0
            )

            # Number of words in the current chunk plus the next sentence is larger than the split_length,
            # or we reached the last sentence
            if (chunk_word_count + next_sentence_word_count) > split_length or sentence_idx == len(sentences) - 1:
                # Save current chunk and start a new one
                list_of_splits.append(current_chunk)
                split_start_page_numbers.append(chunk_starting_page_number)
                split_start_indices.append(chunk_start_idx)

                # Get the number of sentences that overlap with the next chunk
                num_sentences_to_keep = DocumentSplitter._number_of_sentences_to_keep(
                    sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
                )
                # Set up information for the new chunk
                if num_sentences_to_keep > 0:
                    # Processed sentences are the ones that are not overlapping with the next chunk
                    processed_sentences = current_chunk[:-num_sentences_to_keep]
                    chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences)
                    chunk_start_idx += len("".join(processed_sentences))
                    # Next chunk starts with the sentences that were overlapping with the previous chunk
                    current_chunk = current_chunk[-num_sentences_to_keep:]
                    chunk_word_count = sum(len(s.split()) for s in current_chunk)
                else:
                    # Here processed_sentences is the same as current_chunk since there is no overlap
                    chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk)
                    chunk_start_idx += len("".join(current_chunk))
                    current_chunk = []
                    chunk_word_count = 0

        # Concatenate the sentences together within each split
        text_splits = []
        for split in list_of_splits:
            text = "".join(split)
            if len(text) > 0:
                text_splits.append(text)

        return text_splits, split_start_page_numbers, split_start_indices

    @staticmethod
    def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int) -> int:
        """
        Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.

        :param sentences: The list of sentences to split.
        :param split_length: The maximum number of words in each split.
        :param split_overlap: The number of overlapping words in each split.
        :returns: The number of sentences to keep in the next chunk.
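
        Example (a worked sketch): for sentences with word counts `[3, 2, 4]`, `split_length=5`, and
        `split_overlap=2`, iterating backwards and skipping the first sentence, the last sentence alone
        already covers the 2-word overlap, so the function returns 1.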
        """
        # If the split_overlap is 0, we don't need to keep any sentences
        if split_overlap == 0:
            return 0

        num_sentences_to_keep = 0
        num_words = 0
        # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
        for sent in reversed(sentences[1:]):
            num_words += len(sent.split())
            # If the number of words is larger than the split_length then don't add any more sentences
            if num_words > split_length:
                break
            num_sentences_to_keep += 1
            if num_words > split_overlap:
                break
        return num_sentences_to_keep