• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 11461254431

22 Oct 2024 01:28PM UTC coverage: 90.468% (+0.1%) from 90.344%
11461254431

Pull #8463

github

web-flow
Merge cd7d00658 into 0157459a7
Pull Request #8463: fix: window_size set during run instead of construction

7517 of 8309 relevant lines covered (90.47%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.96
haystack/components/retrievers/sentence_window_retriever.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from typing import Any, Dict, List, Optional
1✔
6

7
from haystack import Document, component, default_from_dict, default_to_dict
1✔
8
from haystack.document_stores.types import DocumentStore
1✔
9
from haystack.utils import deserialize_document_store_in_init_params_inplace
1✔
10

11

12
@component
1✔
13
class SentenceWindowRetriever:
1✔
14
    """
15
    Retrieves documents adjacent to a given document in the Document Store.
16

17
    During indexing, documents are broken into smaller chunks, or sentences. When you submit a query,
18
    the Retriever fetches the most relevant sentence. To provide full context,
19
    SentenceWindowRetriever fetches a number of neighboring sentences before and after each
20
    relevant one. You can set this number with the `window_size` parameter.
21
    It uses `source_id` and `doc.meta['split_id']` to locate the surrounding documents.
22

23
    This component works with existing Retrievers, like BM25Retriever or
24
    EmbeddingRetriever. First, use a Retriever to find documents based on a query and then use
25
    SentenceWindowRetriever to get the surrounding documents for context.
26

27
    The SentenceWindowRetriever is compatible with the following DocumentStores:
28
    - [Astra](https://docs.haystack.deepset.ai/docs/astradocumentstore)
29
    - [Elasticsearch](https://docs.haystack.deepset.ai/docs/elasticsearch-document-store)
30
    - [OpenSearch](https://docs.haystack.deepset.ai/docs/opensearch-document-store)
31
    - [Pgvector](https://docs.haystack.deepset.ai/docs/pgvectordocumentstore)
32
    - [Pinecone](https://docs.haystack.deepset.ai/docs/pinecone-document-store)
33
    - [Qdrant](https://docs.haystack.deepset.ai/docs/qdrant-document-store)
34

35
    ### Usage example
36

37
    ```python
38
    from haystack import Document, Pipeline
39
    from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
40
    from haystack.components.retrievers import SentenceWindowRetriever
41
    from haystack.components.preprocessors import DocumentSplitter
42
    from haystack.document_stores.in_memory import InMemoryDocumentStore
43

44
    splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
45
    text = (
46
            "This is a text with some words. There is a second sentence. And there is also a third sentence. "
47
            "It also contains a fourth sentence. And a fifth sentence. And a sixth sentence. And a seventh sentence"
48
    )
49
    doc = Document(content=text)
50
    docs = splitter.run([doc])
51
    doc_store = InMemoryDocumentStore()
52
    doc_store.write_documents(docs["documents"])
53

54

55
    rag = Pipeline()
56
    rag.add_component("bm25_retriever", InMemoryBM25Retriever(doc_store, top_k=1))
57
    rag.add_component("sentence_window_retriever", SentenceWindowRetriever(document_store=doc_store, window_size=2))
58
    rag.connect("bm25_retriever", "sentence_window_retriever")
59

60
    rag.run({'bm25_retriever': {"query":"third"}})
61

62
    >> {'sentence_window_retriever': {'context_windows': ['some words. There is a second sentence.
63
    >> And there is also a third sentence. It also contains a fourth sentence. And a fifth sentence. And a sixth
64
    >> sentence. And a'], 'context_documents': [[Document(id=..., content: 'some words. There is a second sentence.
65
    >> And there is ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 1, 'split_idx_start': 20,
66
    >> '_split_overlap': [{'doc_id': '...', 'range': (20, 43)}, {'doc_id': '...', 'range': (0, 30)}]}),
67
    >> Document(id=..., content: 'second sentence. And there is also a third sentence. It ',
68
    >> meta: {'source_id': '74ea87deb38012873cf8c07e...f19d01a26a098447113e1d7b83efd30c02987114', 'page_number': 1,
69
    >> 'split_id': 2, 'split_idx_start': 43, '_split_overlap': [{'doc_id': '...', 'range': (23, 53)}, {'doc_id': '...',
70
    >> 'range': (0, 26)}]}), Document(id=..., content: 'also a third sentence. It also contains a fourth sentence. ',
71
    >> meta: {'source_id': '...', 'page_number': 1, 'split_id': 3, 'split_idx_start': 73, '_split_overlap':
72
    >> [{'doc_id': '...', 'range': (30, 56)}, {'doc_id': '...', 'range': (0, 33)}]}), Document(id=..., content:
73
    >> 'also contains a fourth sentence. And a fifth sentence. And ', meta: {'source_id': '...', 'page_number': 1,
74
    >> 'split_id': 4, 'split_idx_start': 99, '_split_overlap': [{'doc_id': '...', 'range': (26, 59)},
75
    >> {'doc_id': '...', 'range': (0, 26)}]}), Document(id=..., content: 'And a fifth sentence. And a sixth sentence.
76
    >> And a ', meta: {'source_id': '...', 'page_number': 1, 'split_id': 5, 'split_idx_start': 132,
77
    >> '_split_overlap': [{'doc_id': '...', 'range': (33, 59)}, {'doc_id': '...', 'range': (0, 24)}]})]]}}
78
    ```
79
    """
80

81
    def __init__(self, document_store: DocumentStore, window_size: int = 3):
1✔
82
        """
83
        Creates a new SentenceWindowRetriever component.
84

85
        :param document_store: The Document Store to retrieve the surrounding documents from.
86
        :param window_size: The number of documents to retrieve before and after the relevant one.
87
                For example, `window_size: 2` fetches 2 preceding and 2 following documents.
88
        """
89
        if window_size < 1:
1✔
90
            raise ValueError("The window_size parameter must be greater than 0.")
1✔
91

92
        self.window_size = window_size
1✔
93
        self.document_store = document_store
1✔
94

95
    @staticmethod
1✔
96
    def merge_documents_text(documents: List[Document]) -> str:
1✔
97
        """
98
        Merge a list of document text into a single string.
99

100
        This functions concatenates the textual content of a list of documents into a single string, eliminating any
101
        overlapping content.
102

103
        :param documents: List of Documents to merge.
104
        """
105
        sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"])
1✔
106
        merged_text = ""
1✔
107
        last_idx_end = 0
1✔
108
        for doc in sorted_docs:
1✔
109
            start = doc.meta["split_idx_start"]  # start of the current content
1✔
110

111
            # if the start of the current content is before the end of the last appended content, adjust it
112
            start = max(start, last_idx_end)
1✔
113

114
            # append the non-overlapping part to the merged text
115
            merged_text += doc.content[start - doc.meta["split_idx_start"] :]  # type: ignore
1✔
116

117
            # update the last end index
118
            last_idx_end = doc.meta["split_idx_start"] + len(doc.content)  # type: ignore
1✔
119

120
        return merged_text
1✔
121

122
    def to_dict(self) -> Dict[str, Any]:
        """
        Converts this component into its dictionary representation.

        The Document Store is serialized through its own `to_dict()` method and handed, together with
        `window_size`, to `default_to_dict` as keyword arguments.

        :returns:
            Dictionary with serialized data.
        """
        init_parameters = {
            "document_store": self.document_store.to_dict(),
            "window_size": self.window_size,
        }
        return default_to_dict(self, **init_parameters)
1✔
131

132
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SentenceWindowRetriever":
        """
        Builds a component instance from its dictionary representation.

        :param data: The dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        # First turn the serialized document store inside `data` back into an object
        # (the helper's name indicates it modifies `data` in place, so nothing is returned here).
        deserialize_document_store_in_init_params_inplace(data)

        # Then let the generic helper build the component from the prepared dictionary.
        restored_component = default_from_dict(cls, data)
        return restored_component
1✔
145

146
    @component.output_types(context_windows=List[str], context_documents=List[List[Document]])
1✔
147
    def run(self, retrieved_documents: List[Document], window_size: Optional[int] = None):
1✔
148
        """
149
        Based on the `source_id` and on the `doc.meta['split_id']` get surrounding documents from the document store.
150

151
        Implements the logic behind the sentence-window technique, retrieving the surrounding documents of a given
152
        document from the document store.
153

154
        :param retrieved_documents: List of retrieved documents from the previous retriever.
155
        :param window_size: The number of documents to retrieve before and after the relevant one. This will overwrite
156
                            the `window_size` parameter set in the constructor.
157
        :returns:
158
            A dictionary with the following keys:
159
                - `context_windows`: A list of strings, where each string represents the concatenated text from the
160
                                     context window of the corresponding document in `retrieved_documents`.
161
                - `context_documents`: A list of lists of `Document` objects, where each inner list contains the
162
                                     documents that come from the context window for the corresponding document in
163
                                     `retrieved_documents`.
164

165
        """
166
        window_size = window_size or self.window_size
1✔
167

168
        if window_size < 1:
1✔
169
            raise ValueError("The window_size parameter must be greater than 0.")
×
170

171
        if not all("split_id" in doc.meta for doc in retrieved_documents):
1✔
172
            raise ValueError("The retrieved documents must have 'split_id' in the metadata.")
1✔
173

174
        if not all("source_id" in doc.meta for doc in retrieved_documents):
1✔
175
            raise ValueError("The retrieved documents must have 'source_id' in the metadata.")
1✔
176

177
        context_text = []
1✔
178
        context_documents = []
1✔
179
        for doc in retrieved_documents:
1✔
180
            source_id = doc.meta["source_id"]
1✔
181
            split_id = doc.meta["split_id"]
1✔
182
            min_before = min(list(range(split_id - 1, split_id - window_size - 1, -1)))
1✔
183
            max_after = max(list(range(split_id + 1, split_id + window_size + 1, 1)))
1✔
184
            context_docs = self.document_store.filter_documents(
1✔
185
                {
186
                    "operator": "AND",
187
                    "conditions": [
188
                        {"field": "meta.source_id", "operator": "==", "value": source_id},
189
                        {"field": "meta.split_id", "operator": ">=", "value": min_before},
190
                        {"field": "meta.split_id", "operator": "<=", "value": max_after},
191
                    ],
192
                }
193
            )
194
            context_text.append(self.merge_documents_text(context_docs))
1✔
195
            context_documents.append(context_docs)
1✔
196

197
        return {"context_windows": context_text, "context_documents": context_documents}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc