11461254431

Committed 22 Oct 2024 01:28PM UTC coverage: 90.468% (+0.1%) from 90.344%

Build # 11461254431

Build Type

Pull #8463

github

Committed by

web-flow

Commit Message

Merge cd7d00658 into 0157459a7

Pull Request #8463: fix: window_size set during run instead of construction

Run Details

7517 of 8309 relevant lines covered (90.47%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.96

haystack/components/retrievers/sentence_window_retriever.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.document_stores.types import DocumentStore
from haystack.utils import deserialize_document_store_in_init_params_inplace


@component
class SentenceWindowRetriever:
    """
    Retrieves documents adjacent to a given document in the Document Store.

    During indexing, documents are broken into smaller chunks, or sentences. When you submit a query,
    the Retriever fetches the most relevant sentence. To provide full context,
    SentenceWindowRetriever fetches a number of neighboring sentences before and after each
    relevant one. You can set this number with the `window_size` parameter, either when creating the
    component or for a single call to `run()`.
    It uses `source_id` and `doc.meta['split_id']` to locate the surrounding documents.

    This component works with existing Retrievers, like BM25Retriever or
    EmbeddingRetriever. First, use a Retriever to find documents based on a query and then use
    SentenceWindowRetriever to get the surrounding documents for context.

    The SentenceWindowRetriever is compatible with the following DocumentStores:
    - [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
    - [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
    - [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
    - [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
    - [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store)
    - [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)

    ### Usage example

    ```python
    from haystack import Document, Pipeline
    from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
    from haystack.components.retrievers import SentenceWindowRetriever
    from haystack.components.preprocessors import DocumentSplitter
    from haystack.document_stores.in_memory import InMemoryDocumentStore

    splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
    text = (
            "This is a text with some words. There is a second sentence. And there is also a third sentence. "
            "It also contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a seventh sentence"
    )
    doc = Document(content=text)
    docs = splitter.run([doc])
    doc_store = InMemoryDocumentStore()
    doc_store.write_documents(docs["documents"])


    rag = Pipeline()
    rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1))
    rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2))
    rag.connect("bm25_retriever", "sentence_window_retriever")

    rag.run({'bm25_retriever': {"query":"third"}})

    >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
    >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
    >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
    >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
    >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
    >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
    >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
    >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...',
    >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
    >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
    >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
    >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
    >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
    >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
    >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
    >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}
    ```
    """

    def __init__(self, document_store: DocumentStore, window_size: int = 3):
        """
        Creates a new SentenceWindowRetriever component.

        :param document_store: The Document Store to retrieve the surrounding documents from.
        :param window_size: The number of documents to retrieve before and after the relevant one.
                For example, `window_size: 2` fetches 2 preceding and 2 following documents.
        :raises ValueError: If `window_size` is smaller than 1.
        """
        if window_size < 1:
            raise ValueError("The window_size parameter must be greater than 0.")

        self.window_size = window_size
        self.document_store = document_store

    @staticmethod
    def merge_documents_text(documents: List[Document]) -> str:
        """
        Merge a list of document text into a single string.

        This function concatenates the textual content of a list of documents into a single string, eliminating any
        overlapping content. Documents are ordered by `meta["split_idx_start"]` before merging, so the input order
        does not matter. A document whose content is `None` contributes no text.

        :param documents: List of Documents to merge. Each one must carry `split_idx_start` in its metadata.
        :returns:
            The merged text, with overlapping regions included only once.
        """
        sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"])

        pieces: List[str] = []
        last_idx_end = 0  # offset (in the source text) up to which content has already been emitted
        for doc in sorted_docs:
            doc_start = doc.meta["split_idx_start"]
            text = doc.content or ""

            # number of leading characters of this chunk that were already emitted by previous chunks
            already_emitted = max(last_idx_end - doc_start, 0)
            pieces.append(text[already_emitted:])

            # Only ever advance the marker: a chunk that lies entirely inside an earlier one must not move it
            # backwards, otherwise the following chunk would re-emit text that is already part of the result.
            last_idx_end = max(last_idx_end, doc_start + len(text))

        return "".join(pieces)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        docstore = self.document_store.to_dict()
        return default_to_dict(self, document_store=docstore, window_size=self.window_size)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetriever":
        """
        Deserializes the component from a dictionary.

        :param data: Dictionary to deserialize from. The document store entry in it is deserialized in place.
        :returns:
            Deserialized component.
        """
        # deserialize the document store
        deserialize_document_store_in_init_params_inplace(data)

        # deserialize the component
        return default_from_dict(cls, data)

    @component.output_types(context_windows=List[str], context_documents=List[List[Document]])
    def run(self, retrieved_documents: List[Document], window_size: Optional[int] = None):
        """
        Based on the `source_id` and on the `doc.meta['split_id']` get surrounding documents from the document store.

        Implements the logic behind the sentence-window technique, retrieving the surrounding documents of a given
        document from the document store.

        :param retrieved_documents: List of retrieved documents from the previous retriever.
        :param window_size: The number of documents to retrieve before and after the relevant one. When given, it
                            overrides the `window_size` parameter set in the constructor for this call only.
        :returns:
            A dictionary with the following keys:
                - `context_windows`: A list of strings, where each string represents the concatenated text from the
                                     context window of the corresponding document in `retrieved_documents`.
                - `context_documents`: A list of lists of `Document` objects, where each inner list contains the
                                     documents that come from the context window for the corresponding document in
                                     `retrieved_documents`.
        :raises ValueError: If the effective `window_size` is smaller than 1, or if any retrieved document lacks
            `split_id` or `source_id` in its metadata.
        """
        # Compare against None explicitly: `window_size or self.window_size` would treat an explicit 0 as
        # "not provided" and silently bypass the validation below.
        if window_size is None:
            window_size = self.window_size

        if window_size < 1:
            raise ValueError("The window_size parameter must be greater than 0.")

        if not all("split_id" in doc.meta for doc in retrieved_documents):
            raise ValueError("The retrieved documents must have 'split_id' in the metadata.")

        if not all("source_id" in doc.meta for doc in retrieved_documents):
            raise ValueError("The retrieved documents must have 'source_id' in the metadata.")

        context_text = []
        context_documents = []
        for doc in retrieved_documents:
            source_id = doc.meta["source_id"]
            split_id = doc.meta["split_id"]

            # inclusive split_id bounds of the window centred on the retrieved document
            min_before = split_id - window_size
            max_after = split_id + window_size

            context_docs = self.document_store.filter_documents(
                {
                    "operator": "AND",
                    "conditions": [
                        {"field": "meta.source_id", "operator": "==", "value": source_id},
                        {"field": "meta.split_id", "operator": ">=", "value": min_before},
                        {"field": "meta.split_id", "operator": "<=", "value": max_after},
                    ],
                }
            )
            context_text.append(self.merge_documents_text(context_docs))
            context_documents.append(context_docs)

        return {"context_windows": context_text, "context_documents": context_documents}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	from typing import Any, Dict, List, Optional	1✔
6
7	from haystack import Document, component, default_from_dict, default_to_dict	1✔
8	from haystack.document_stores.types import DocumentStore	1✔
9	from haystack.utils import deserialize_document_store_in_init_params_inplace	1✔
10
11
12	@component	1✔
13	class SentenceWindowRetriever:	1✔
14	"""
15	Retrieves documents adjacent to a given document in the Document Store.
16
17	During indexing, documents are broken into smaller chunks, or sentences. When you submit a query,
18	the Retriever fetches the most relevant sentence. To provide full context,
19	SentenceWindowRetriever fetches a number of neighboring sentences before and after each
20	relevant one. You can set this number with the `window_size` parameter.
21	It uses `source_id` and `doc.meta['split_id']` to locate the surrounding documents.
22
23	This component works with existing Retrievers, like BM25Retriever or
24	EmbeddingRetriever. First, use a Retriever to find documents based on a query and then use
25	SentenceWindowRetriever to get the surrounding documents for context.
26
27	The SentenceWindowRetriever is compatible with the following DocumentStores:
28	- [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
29	- [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
30	- [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
31	- [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
32	- [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store)
33	- [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
34
35	### Usage example
36
37	```python
38	from haystack import Document, Pipeline
39	from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
40	from haystack.components.retrievers import SentenceWindowRetriever
41	from haystack.components.preprocessors import DocumentSplitter
42	from haystack.document_stores.in_memory import InMemoryDocumentStore
43
44	splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
45	text = (
46	"This is a text with some words. There is a second sentence. And there is also a third sentence. "
47	"It also contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a seventh sentence"
48	)
49	doc = Document(content=text)
50	docs = splitter.run([doc])
51	doc_store = InMemoryDocumentStore()
52	doc_store.write_documents(docs["documents"])
53
54
55	rag = Pipeline()
56	rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1))
57	rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2))
58	rag.connect("bm25_retriever", "sentence_window_retriever")
59
60	rag.run({'bm25_retriever': {"query":"third"}})
61
62	>> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
63	>> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
64	>> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
65	>> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
66	>> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
67	>> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
68	>> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
69	>> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...',
70	>> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
71	>> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
72	>> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
73	>> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
74	>> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
75	>> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
76	>> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
77	>> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}}}
78	```
79	"""
80
81	def __init__(self, document_store: DocumentStore, window_size: int = 3):	1✔
82	"""
83	Creates a new SentenceWindowRetriever component.
84
85	:param document_store: The Document Store to retrieve the surrounding documents from.
86	:param window_size: The number of documents to retrieve before and after the relevant one.
87	For example, `window_size: 2` fetches 2 preceding and 2 following documents.
88	"""
89	if window_size < 1:	1✔
90	raise ValueError("The window_size parameter must be greater than 0.")	1✔
91
92	self.window_size = window_size	1✔
93	self.document_store = document_store	1✔
94
95	@staticmethod	1✔
96	def merge_documents_text(documents: List[Document]) -> str:	1✔
97	"""
98	Merge a list of document text into a single string.
99
100	This functions concatenates the textual content of a list of documents into a single string, eliminating any
101	overlapping content.
102
103	:param documents: List of Documents to merge.
104	"""
105	sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"])	1✔
106	merged_text = ""	1✔
107	last_idx_end = 0	1✔
108	for doc in sorted_docs:	1✔
109	start = doc.meta["split_idx_start"] # start of the current content	1✔
110
111	# if the start of the current content is before the end of the last appended content, adjust it
112	start = max(start, last_idx_end)	1✔
113
114	# append the non-overlapping part to the merged text
115	merged_text += doc.content[start - doc.meta["split_idx_start"] :] # type: ignore	1✔
116
117	# update the last end index
118	last_idx_end = doc.meta["split_idx_start"] + len(doc.content) # type: ignore	1✔
119
120	return merged_text	1✔
121
122	def to_dict(self) -> Dict[str, Any]:	1✔
123	"""
124	Serializes the component to a dictionary.
125
126	:returns:
127	Dictionary with serialized data.
128	"""
129	docstore = self.document_store.to_dict()	1✔
130	return default_to_dict(self, document_store=docstore, window_size=self.window_size)	1✔
131
132	@classmethod	1✔
133	def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetriever":	1✔
134	"""
135	Deserializes the component from a dictionary.
136
137	:returns:
138	Deserialized component.
139	"""
140	# deserialize the document store
141	deserialize_document_store_in_init_params_inplace(data)	1✔
142
143	# deserialize the component
144	return default_from_dict(cls, data)	1✔
145
146	@component.output_types(context_windows=List[str], context_documents=List[List[Document]])	1✔
147	def run(self, retrieved_documents: List[Document], window_size: Optional[int] = None):	1✔
148	"""
149	Based on the `source_id` and on the `doc.meta['split_id']` get surrounding documents from the document store.
150
151	Implements the logic behind the sentence-window technique, retrieving the surrounding documents of a given
152	document from the document store.
153
154	:param retrieved_documents: List of retrieved documents from the previous retriever.
155	:param window_size: The number of documents to retrieve before and after the relevant one. This will overwrite
156	the `window_size` parameter set in the constructor.
157	:returns:
158	A dictionary with the following keys:
159	- `context_windows`: A list of strings, where each string represents the concatenated text from the
160	context window of the corresponding document in `retrieved_documents`.
161	- `context_documents`: A list of lists of `Document` objects, where each inner list contains the
162	documents that come from the context window for the corresponding document in
163	`retrieved_documents`.
164
165	"""
166	window_size = window_size or self.window_size	1✔
167
168	if window_size < 1:	1✔
169	raise ValueError("The window_size parameter must be greater than 0.")	×
170
171	if not all("split_id" in doc.meta for doc in retrieved_documents):	1✔
172	raise ValueError("The retrieved documents must have 'split_id' in the metadata.")	1✔
173
174	if not all("source_id" in doc.meta for doc in retrieved_documents):	1✔
175	raise ValueError("The retrieved documents must have 'source_id' in the metadata.")	1✔
176
177	context_text = []	1✔
178	context_documents = []	1✔
179	for doc in retrieved_documents:	1✔
180	source_id = doc.meta["source_id"]	1✔
181	split_id = doc.meta["split_id"]	1✔
182	min_before = min(list(range(split_id - 1, split_id - window_size - 1, -1)))	1✔
183	max_after = max(list(range(split_id + 1, split_id + window_size + 1, 1)))	1✔
184	context_docs = self.document_store.filter_documents(	1✔
185	{
186	"operator": "AND",
187	"conditions": [
188	{"field": "meta.source_id", "operator": "==", "value": source_id},
189	{"field": "meta.split_id", "operator": ">=", "value": min_before},
190	{"field": "meta.split_id", "operator": "<=", "value": max_after},
191	],
192	}
193	)
194	context_text.append(self.merge_documents_text(context_docs))	1✔
195	context_documents.append(context_docs)	1✔
196
197	return {"context_windows": context_text, "context_documents": context_documents}	1✔

deepset-ai / haystack / 11461254431

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous