• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 9568249476

18 Jun 2024 03:52PM UTC coverage: 89.872% (-0.1%) from 89.995%
9568249476

push

github

web-flow
ci: Add code formatting checks  (#7882)

* ruff settings

enable ruff format and re-format outdated files

feat: `EvaluationRunResult` add parameter to specify columns to keep in the comparative `Dataframe`  (#7879)

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* updating tests

* adding release notes

* Update haystack/evaluation/eval_run_result.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update releasenotes/notes/add-keep-columns-to-EvalRunResult-comparative-be3e15ce45de3e0b.yaml

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* updating docstring

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

add format-check

fail on format and linting failures

fix string formatting

reformat long lines

fix tests

fix typing

linter

pull from main

* reformat

* lint -> check

* lint -> check

6957 of 7741 relevant lines covered (89.87%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.63
haystack/components/preprocessors/document_splitter.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from copy import deepcopy
1✔
6
from typing import Dict, List, Literal, Tuple
1✔
7

8
from more_itertools import windowed
1✔
9

10
from haystack import Document, component
1✔
11

12

13
@component
class DocumentSplitter:
    """
    Splits a list of text documents into a list of text documents with shorter texts.

    Splitting documents with long texts is a common preprocessing step during indexing.
    This allows Embedders to create significant semantic representations
    and avoids exceeding the maximum context length of language models.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
    result = splitter.run(documents=[doc])
    ```
    """

    def __init__(
        self,
        split_by: Literal["word", "sentence", "page", "passage"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
        split_threshold: int = 0,
    ):
        """
        Initialize the DocumentSplitter.

        :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
            "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of units that each split should overlap. Must be smaller than
            `split_length`.
        :param split_threshold: The minimum number of units that the split should have. If the split has fewer units
            than the threshold, it will be attached to the previous split.

        :raises ValueError: if `split_by` is not a supported unit, if `split_length` is not positive, if
            `split_overlap` is negative, or if `split_overlap` is not smaller than `split_length`.
        """

        self.split_by = split_by
        if split_by not in ["word", "sentence", "page", "passage"]:
            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        # The sliding window in `_concatenate_units` advances by `split_length - split_overlap` units.
        # Reject configurations where that step would be zero or negative here, instead of failing
        # later inside `run()` with an unrelated-looking error.
        if split_overlap >= split_length:
            raise ValueError("split_overlap must be less than split_length.")
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
            document to keep track of the original document that was split. Another metadata field "page_number"
            is added to each document to keep track of the page it belonged to in the original document. Other
            metadata are copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        # Only the first element is type-checked; an empty list is accepted and yields no documents.
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs: List[Document] = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits, splits_pages = self._concatenate_units(
                units, self.split_length, self.split_overlap, self.split_threshold
            )
            # Copy the metadata so the source document's `meta` is never mutated.
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
            split_docs += self._create_docs_from_splits(
                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
        """
        Split `text` into units on the delimiter associated with `split_by`.

        The delimiter is kept at the end of every unit except the last one, so joining the returned
        units reproduces `text` exactly.

        :param text: The text to split.
        :param split_by: The unit to split by.

        :returns: The list of units, in order.

        :raises NotImplementedError: if `split_by` is not one of the supported units.
        """
        if split_by == "page":
            split_at = "\f"
        elif split_by == "passage":
            split_at = "\n\n"
        elif split_by == "sentence":
            split_at = "."
        elif split_by == "word":
            split_at = " "
        else:
            raise NotImplementedError(
                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
            )
        units = text.split(split_at)
        # `str.split` with an explicit separator always returns at least one element, so
        # `units[-1:]` is never empty. Add the delimiter back to all units except the last one.
        return [unit + split_at for unit in units[:-1]] + units[-1:]

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
    ) -> Tuple[List[str], List[int]]:
        """
        Concatenates the elements into parts of split_length units.

        Keeps track of the original page number that each element belongs. If the length of the current units is less
        than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current
        units with the last split, preventing the creation of excessively small splits.

        :param elements: The units to concatenate, as produced by `_split_into_units`.
        :param split_length: The number of units per window.
        :param split_overlap: The number of units shared by two consecutive windows.
        :param split_threshold: The minimum number of units a window needs to become its own split.

        :returns: A tuple `(text_splits, splits_pages)` where `splits_pages[i]` is the page number recorded
            when `text_splits[i]` was created.
        """

        text_splits: List[str] = []
        splits_pages: List[int] = []
        cur_page = 1
        # Number of units the window advances per iteration.
        step = split_length - split_overlap
        segments = windowed(elements, n=split_length, step=step)
        for seg in segments:
            # `windowed` pads the final window with None; drop the padding.
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)
            # check if length of current units is below split_threshold
            if len(current_units) < split_threshold and len(text_splits) > 0:
                # concatenate the last split with the current one
                text_splits[-1] += txt
            elif len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
            # Only the units that the next window will not revisit advance the page counter,
            # so overlapping units are not counted twice.
            processed_units = current_units[:step]
            if self.split_by == "page":
                # When splitting by page, every processed unit counts as one page.
                num_page_breaks = len(processed_units)
            else:
                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
            cur_page += num_page_breaks
        return text_splits, splits_pages

    @staticmethod
    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
        """
        Creates Document objects from splits enriching them with page number and the metadata of the original document.

        :param text_splits: The text of each split.
        :param splits_pages: The page number of each split, aligned with `text_splits`.
        :param meta: The metadata to copy into every created document.

        :returns: One Document per split, each with its own copy of `meta` plus a "page_number" entry.
        """
        documents: List[Document] = []

        for txt, page_number in zip(text_splits, splits_pages):
            # Each document gets an independent copy so later edits to one do not leak into the others.
            doc = Document(content=txt, meta=deepcopy(meta))
            # NOTE(review): "page_number" is deliberately set after construction, as before; presumably
            # Document derives state (e.g. its id) at construction time - confirm before reordering.
            doc.meta["page_number"] = page_number
            documents.append(doc)
        return documents
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc