• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 8877339957

29 Apr 2024 10:51AM UTC coverage: 90.12% (+0.02%) from 90.096%
8877339957

push

github

web-flow
feat: add page_number to metadata in DocumentSplitter (#7599)

* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705.

* Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x.
Solve some minor bugs spotted by tests.

* Update docstrings.

* Add reno.

* Update haystack/components/preprocessors/document_splitter.py

Update docstring from suggestion

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* solve suggestion to improve readability

* fragment tests

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* Update .gitignore

* Update .gitignore

* Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml

* blackening

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>

6330 of 7024 relevant lines covered (90.12%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.57
haystack/components/preprocessors/document_splitter.py
1
from copy import deepcopy
1✔
2
from typing import Dict, List, Literal, Tuple
1✔
3

4
from more_itertools import windowed
1✔
5

6
from haystack import Document, component
1✔
7

8

9
@component
class DocumentSplitter:
    """
    Splits a list of text documents into a list of text documents with shorter texts.

    Splitting documents with long texts is a common preprocessing step during indexing.
    This allows Embedders to create significant semantic representations
    and avoids exceeding the maximum context length of language models.
    """

    def __init__(
        self,
        split_by: Literal["word", "sentence", "page", "passage"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
    ):
        """
        Initialize the DocumentSplitter.

        :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
            "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of units that each split should overlap.

        :raises ValueError: if `split_by` is not a supported unit, if `split_length` is not positive,
            if `split_overlap` is negative, or if `split_overlap` is not smaller than `split_length`.
        """

        self.split_by = split_by
        if split_by not in ["word", "sentence", "page", "passage"]:
            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        if split_overlap >= split_length:
            # windowed() below requires a positive step (split_length - split_overlap);
            # failing fast here gives a clearer error than the one raised later by more_itertools.
            raise ValueError("split_overlap must be less than split_length.")
        self.split_overlap = split_overlap

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
              document to keep track of the original document that was split. Another metadata field "page_number"
              is added to each document to keep track of the page it belonged to in the original document. Other
              metadata are copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        # Check every element, not just the first one, so a mixed list fails with the
        # intended TypeError instead of an AttributeError further down.
        if not isinstance(documents, list) or not all(isinstance(doc, Document) for doc in documents):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
            split_docs += self._create_docs_from_splits(
                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
        """
        Split `text` into units using the delimiter implied by `split_by`.

        The delimiter is re-appended to every unit except the last, so joining the units
        reproduces the original text exactly.
        """
        if split_by == "page":
            split_at = "\f"
        elif split_by == "passage":
            split_at = "\n\n"
        elif split_by == "sentence":
            split_at = "."
        elif split_by == "word":
            split_at = " "
        else:
            raise NotImplementedError(
                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
            )
        units = text.split(split_at)
        # Add the delimiter back to all units except the last one
        for i in range(len(units) - 1):
            units[i] += split_at
        return units

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int
    ) -> Tuple[List[str], List[int]]:
        """
        Concatenates the elements into parts of `split_length` units, keeping track of the page
        number on which each part starts.

        :returns: A tuple of (text splits, starting page number for each split).
        """
        text_splits = []
        splits_pages = []
        cur_page = 1
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
        for seg in segments:
            # windowed() pads the last window with None values; drop the padding.
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)
            if len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
                # Only the units consumed by this step advance the page counter; the
                # overlapping tail is counted when the next window processes it.
                processed_units = current_units[: split_length - split_overlap]
                if self.split_by == "page":
                    num_page_breaks = len(processed_units)
                else:
                    num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
                cur_page += num_page_breaks
        return text_splits, splits_pages

    @staticmethod
    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
        """
        Creates Document objects from text splits, enriching them with the page number and the
        metadata of the original document.
        """
        documents: List[Document] = []

        for i, txt in enumerate(text_splits):
            # Each split gets its own deep copy of the metadata so mutating one split's
            # meta cannot leak into its siblings.
            meta = deepcopy(meta)
            doc = Document(content=txt, meta=meta)
            doc.meta["page_number"] = splits_pages[i]
            documents.append(doc)
        return documents
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc