9568249476

Committed 18 Jun 2024 03:52PM UTC coverage: 89.872% (-0.1%) from 89.995%

Build # 9568249476

Build Type

push

github

Committed by

web-flow

Commit Message

ci: Add code formatting checks  (#7882)

* ruff settings

enable ruff format and re-format outdated files

feat: `EvaluationRunResult` add parameter to specify columns to keep in the comparative `Dataframe`  (#7879)

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* updating tests

* adding release notes

* Update haystack/evaluation/eval_run_result.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update releasenotes/notes/add-keep-columns-to-EvalRunResult-comparative-be3e15ce45de3e0b.yaml

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* updating docstring

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

add format-check

fail on format and linting failures

fix string formatting

reformat long lines

fix tests

fix typing

linter

pull from main

* reformat

* lint -> check

* lint -> check

Run Details

6957 of 7741 relevant lines covered (89.87%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.63

haystack/components/preprocessors/document_splitter.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from typing import Dict, List, Literal, Tuple

from more_itertools import windowed

from haystack import Document, component


@component
class DocumentSplitter:
    """
    Splits a list of text documents into a list of text documents with shorter texts.

    Splitting documents with long texts is a common preprocessing step during indexing.
    This allows Embedders to create significant semantic representations
    and avoids exceeding the maximum context length of language models.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
    result = splitter.run(documents=[doc])
    ```
    """

    def __init__(
        self,
        split_by: Literal["word", "sentence", "page", "passage"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
        split_threshold: int = 0,
    ):
        """
        Initialize the DocumentSplitter.

        :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
            "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of units that each split should overlap. Must be smaller than
            `split_length`.
        :param split_threshold: The minimum number of units that the split should have. If the split has fewer units
            than the threshold, it will be attached to the previous split.

        :raises ValueError: if `split_by` is not a supported unit, if `split_length` is not positive, if
            `split_overlap` is negative, or if `split_overlap` is not smaller than `split_length`.
        """

        self.split_by = split_by
        if split_by not in ["word", "sentence", "page", "passage"]:
            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        # The sliding window advances by `split_length - split_overlap` units; a step below 1 cannot make
        # progress and is rejected by `windowed`, so fail here with a clear message instead of inside `run()`.
        if split_overlap >= split_length:
            raise ValueError("split_overlap must be less than split_length.")
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Split documents into smaller parts.

        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
        and an overlap of `split_overlap`.

        :param documents: The documents to split.

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
            document to keep track of the original document that was split. Another metadata field "page_number"
            is added to each document to keep track of the page it belonged to in the original document. Other
            metadata are copied from the original document.

        :raises TypeError: if the input is not a list of Documents.
        :raises ValueError: if the content of a document is None.
        """

        # Only the first element is type-checked; an empty list is accepted and yields no documents.
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
            text_splits, splits_pages = self._concatenate_units(
                units, self.split_length, self.split_overlap, self.split_threshold
            )
            # Copy the metadata so that adding "source_id" never mutates the input document.
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
            split_docs += self._create_docs_from_splits(
                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
        """
        Splits the text into units, keeping the delimiter attached to the end of each unit.

        :param text: The text to split.
        :param split_by: The unit to split by; selects the delimiter (" ", ".", "\\n\\n" or "\\f").

        :returns: The list of units. Joining them with an empty string reproduces `text`.

        :raises NotImplementedError: if `split_by` is not one of the supported options.
        """
        if split_by == "page":
            split_at = "\f"
        elif split_by == "passage":
            split_at = "\n\n"
        elif split_by == "sentence":
            split_at = "."
        elif split_by == "word":
            split_at = " "
        else:
            raise NotImplementedError(
                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
            )
        units = text.split(split_at)
        # Add the delimiter back to all units except the last one
        for i in range(len(units) - 1):
            units[i] += split_at
        return units

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
    ) -> Tuple[List[str], List[int]]:
        """
        Concatenates the elements into parts of split_length units.

        Keeps track of the original page number that each element belongs to. If the length of the current units is
        less than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the
        current units with the last split, preventing the creation of excessively small splits.

        :param elements: The units to group, as produced by `_split_into_units`.
        :param split_length: The number of units in each window.
        :param split_overlap: The number of units shared by consecutive windows; must be smaller than
            `split_length`.
        :param split_threshold: The minimum number of units a window needs to become a split of its own.

        :returns: A tuple with the text of each split and the page number (starting at 1) on which each split begins.
        """

        text_splits: List[str] = []
        splits_pages = []
        cur_page = 1
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
        for seg in segments:
            # The last window may be padded with None; drop the padding.
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)
            # check if length of current units is below split_threshold
            if len(current_units) < split_threshold and len(text_splits) > 0:
                # concatenate the last split with the current one
                text_splits[-1] += txt
            elif len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
            # Only the units that the next window will not revisit advance the page counter,
            # so page breaks inside the overlap are not counted twice.
            processed_units = current_units[: split_length - split_overlap]
            if self.split_by == "page":
                # Each unit is one page when splitting by page.
                num_page_breaks = len(processed_units)
            else:
                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
            cur_page += num_page_breaks
        return text_splits, splits_pages

    @staticmethod
    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
        """
        Creates Document objects from splits enriching them with page number and the metadata of the original document.

        :param text_splits: The text of each split.
        :param splits_pages: The page number of each split, index-aligned with `text_splits`.
        :param meta: The metadata to copy into every created document.

        :returns: One Document per split, each with its own copy of `meta` plus a "page_number" entry.
        """
        documents: List[Document] = []

        for i, txt in enumerate(text_splits):
            # Give every document an independent copy so later edits to one do not leak into the others.
            doc = Document(content=txt, meta=deepcopy(meta))
            doc.meta["page_number"] = splits_pages[i]
            documents.append(doc)
        return documents

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	from copy import deepcopy	1✔
6	from typing import Dict, List, Literal, Tuple	1✔
7
8	from more_itertools import windowed	1✔
9
10	from haystack import Document, component	1✔
11
12
13	@component	1✔
14	class DocumentSplitter:	1✔
15	"""
16	Splits a list of text documents into a list of text documents with shorter texts.
17
18	Splitting documents with long texts is a common preprocessing step during indexing.
19	This allows Embedders to create significant semantic representations
20	and avoids exceeding the maximum context length of language models.
21
22	Usage example:
23	```python
24	from haystack import Document
25	from haystack.components.preprocessors import DocumentSplitter
26
27	doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")
28
29	splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
30	result = splitter.run(documents=[doc])
31	```
32	"""
33
34	def __init__(	1✔
35	self,
36	split_by: Literal["word", "sentence", "page", "passage"] = "word",
37	split_length: int = 200,
38	split_overlap: int = 0,
39	split_threshold: int = 0,
40	):
41	"""
42	Initialize the DocumentSplitter.
43
44	:param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
45	"sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
46	:param split_length: The maximum number of units in each split.
47	:param split_overlap: The number of units that each split should overlap.
48	:param split_threshold: The minimum number of units that the split should have. If the split has fewer units
49	than the threshold, it will be attached to the previous split.
50	"""
51
52	self.split_by = split_by	1✔
53	if split_by not in ["word", "sentence", "page", "passage"]:	1✔
54	raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")	1✔
55	if split_length <= 0:	1✔
56	raise ValueError("split_length must be greater than 0.")	1✔
57	self.split_length = split_length	1✔
58	if split_overlap < 0:	1✔
59	raise ValueError("split_overlap must be greater than or equal to 0.")	1✔
60	self.split_overlap = split_overlap	1✔
61	self.split_threshold = split_threshold	1✔
62
63	@component.output_types(documents=List[Document])	1✔
64	def run(self, documents: List[Document]):	1✔
65	"""
66	Split documents into smaller parts.
67
68	Splits documents by the unit expressed in `split_by`, with a length of `split_length`
69	and an overlap of `split_overlap`.
70
71	:param documents: The documents to split.
72
73	:returns: A dictionary with the following key:
74	- `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
75	document to keep track of the original document that was split. Another metadata field "page_number"
76	is added to each number to keep track of the page it belonged to in the original document. Other metadata
77	are copied from the original document.
78
79	:raises TypeError: if the input is not a list of Documents.
80	:raises ValueError: if the content of a document is None.
81	"""
82
83	if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):	1✔
84	raise TypeError("DocumentSplitter expects a List of Documents as input.")	1✔
85
86	split_docs = []	1✔
87	for doc in documents:	1✔
88	if doc.content is None:	1✔
89	raise ValueError(	1✔
90	f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
91	)
92	units = self._split_into_units(doc.content, self.split_by)	1✔
93	text_splits, splits_pages = self._concatenate_units(	1✔
94	units, self.split_length, self.split_overlap, self.split_threshold
95	)
96	metadata = deepcopy(doc.meta)	1✔
97	metadata["source_id"] = doc.id	1✔
98	split_docs += self._create_docs_from_splits(	1✔
99	text_splits=text_splits, splits_pages=splits_pages, meta=metadata
100	)
101	return {"documents": split_docs}	1✔
102
103	def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:	1✔
104	if split_by == "page":	1✔
105	split_at = "\f"	1✔
106	elif split_by == "passage":	1✔
107	split_at = "\n\n"	1✔
108	elif split_by == "sentence":	1✔
109	split_at = "."	1✔
110	elif split_by == "word":	1✔
111	split_at = " "	1✔
112	else:
113	raise NotImplementedError(	×
114	"DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
115	)
116	units = text.split(split_at)	1✔
117	# Add the delimiter back to all units except the last one
118	for i in range(len(units) - 1):	1✔
119	units[i] += split_at	1✔
120	return units	1✔
121
122	def _concatenate_units(	1✔
123	self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
124	) -> Tuple[List[str], List[int]]:
125	"""
126	Concatenates the elements into parts of split_length units.
127
128	Keeps track of the original page number that each element belongs. If the length of the current units is less
129	than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current
130	units with the last split, preventing the creation of excessively small splits.
131	"""
132
133	text_splits: List[str] = []	1✔
134	splits_pages = []	1✔
135	cur_page = 1	1✔
136	segments = windowed(elements, n=split_length, step=split_length - split_overlap)	1✔
137	for seg in segments:	1✔
138	current_units = [unit for unit in seg if unit is not None]	1✔
139	txt = "".join(current_units)	1✔
140	# check if length of current units is below split_threshold
141	if len(current_units) < split_threshold and len(text_splits) > 0:	1✔
142	# concatenate the last split with the current one
143	text_splits[-1] += txt	1✔
144	elif len(txt) > 0:	1✔
145	text_splits.append(txt)	1✔
146	splits_pages.append(cur_page)	1✔
147	processed_units = current_units[: split_length - split_overlap]	1✔
148	if self.split_by == "page":	1✔
149	num_page_breaks = len(processed_units)	1✔
150	else:
151	num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)	1✔
152	cur_page += num_page_breaks	1✔
153	return text_splits, splits_pages	1✔
154
155	@staticmethod	1✔
156	def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:	1✔
157	"""
158	Creates Document objects from splits enriching them with page number and the metadata of the original document.
159	"""
160	documents: List[Document] = []	1✔
161
162	for i, txt in enumerate(text_splits):	1✔
163	meta = deepcopy(meta)	1✔
164	doc = Document(content=txt, meta=meta)	1✔
165	doc.meta["page_number"] = splits_pages[i]	1✔
166	documents.append(doc)	1✔
167	return documents	1✔

deepset-ai / haystack / 9568249476

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous