• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 8877339957

29 Apr 2024 10:51AM UTC coverage: 90.12% (+0.02%) from 90.096%
8877339957

push

github

web-flow
feat: add page_number to metadata in DocumentSplitter (#7599)

* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705.

* Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x.
Solve some minor bugs spotted by tests.

* Update docstrings.

* Add reno.

* Update haystack/components/preprocessors/document_splitter.py

Update docstring from suggestion

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* solve suggestion to improve readability

* fragment tests

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* Update .gitignore

* Update .gitignore

* Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml

* blackening

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>

6330 of 7024 relevant lines covered (90.12%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.57
haystack/components/preprocessors/document_splitter.py
1
from copy import deepcopy
1✔
2
from typing import Dict, List, Literal, Tuple
1✔
3

4
from more_itertools import windowed
1✔
5

6
from haystack import Document, component
1✔
7

8

9
@component
class DocumentSplitter:
    """
    Splits a list of text documents into a list of text documents with shorter texts.

    Splitting documents with long texts is a common preprocessing step during indexing.
    This allows Embedders to create significant semantic representations
    and avoids exceeding the maximum context length of language models.
    """

    def __init__(
        self,
        split_by: Literal["word", "sentence", "page", "passage"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
    ):
        """
        Initialize the DocumentSplitter.

        :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
            "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of units that each split should overlap.

        :raises ValueError: if `split_by` is not a supported unit, if `split_length` is not positive,
            if `split_overlap` is negative, or if `split_overlap` is not smaller than `split_length`.
        """

        self.split_by = split_by
        if split_by not in ["word", "sentence", "page", "passage"]:
            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        if split_overlap >= split_length:
            # windowed() below requires a positive step (split_length - split_overlap);
            # failing fast here gives a clearer error than the one raised later by more_itertools.
            raise ValueError("split_overlap must be less than split_length.")
        self.split_overlap = split_overlap

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
              document to keep track of the original document that was split. Another metadata field "page_number"
              is added to each document to keep track of the page it belonged to in the original document. Other
              metadata are copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        # Check every element, not just the first one, so a mixed list fails with the
        # intended TypeError instead of an AttributeError further down.
        if not isinstance(documents, list) or not all(isinstance(doc, Document) for doc in documents):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
            split_docs += self._create_docs_from_splits(
                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
        """
        Split `text` into units using the delimiter implied by `split_by`.

        The delimiter is re-appended to every unit except the last, so joining the units
        reproduces the original text exactly.
        """
        if split_by == "page":
            split_at = "\f"
        elif split_by == "passage":
            split_at = "\n\n"
        elif split_by == "sentence":
            split_at = "."
        elif split_by == "word":
            split_at = " "
        else:
            raise NotImplementedError(
                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
            )
        units = text.split(split_at)
        # Add the delimiter back to all units except the last one
        for i in range(len(units) - 1):
            units[i] += split_at
        return units

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int
    ) -> Tuple[List[str], List[int]]:
        """
        Concatenates the elements into parts of `split_length` units, keeping track of the page
        number on which each part starts.

        :returns: A tuple of (text splits, starting page number for each split).
        """
        text_splits = []
        splits_pages = []
        cur_page = 1
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
        for seg in segments:
            # windowed() pads the last window with None values; drop the padding.
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)
            if len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
                # Only the units consumed by this step advance the page counter; the
                # overlapping tail is counted when the next window processes it.
                processed_units = current_units[: split_length - split_overlap]
                if self.split_by == "page":
                    num_page_breaks = len(processed_units)
                else:
                    num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
                cur_page += num_page_breaks
        return text_splits, splits_pages

    @staticmethod
    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
        """
        Creates Document objects from text splits, enriching them with the page number and the
        metadata of the original document.
        """
        documents: List[Document] = []

        for i, txt in enumerate(text_splits):
            # Each split gets its own deep copy of the metadata so mutating one split's
            # meta cannot leak into its siblings.
            meta = deepcopy(meta)
            doc = Document(content=txt, meta=meta)
            doc.meta["page_number"] = splits_pages[i]
            documents.append(doc)
        return documents
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc