deepset-ai / haystack / build 16933015230
13 Aug 2025 09:18AM UTC. Coverage: 92.184% (+0.2%) from 91.969%.

Pull Request #9699: feat: Update `source_id_meta_field` in `SentenceWindowRetriever` to also accept a list of values
Merge cfbd602e7 into 8160ea8bf

12891 of 13984 relevant lines covered (92.18%), 0.92 hits per line.

Source file: haystack/components/embedders/sentence_transformers_document_embedder.py (96.97% covered)

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import replace
from typing import Any, Literal, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.components.embedders.backends.sentence_transformers_backend import (
    _SentenceTransformersEmbeddingBackend,
    _SentenceTransformersEmbeddingBackendFactory,
)
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs


@component
class SentenceTransformersDocumentEmbedder:
    """
    Calculates document embeddings using Sentence Transformers models.

    It stores the embeddings in the `embedding` field of each document.
    You can also embed documents' metadata.
    Use this component in indexing pipelines to embed input documents
    and send them to a DocumentWriter to write them into a Document Store.

    ### Usage example:

    ```python
    from haystack import Document
    from haystack.components.embedders import SentenceTransformersDocumentEmbedder

    doc = Document(content="I love pizza!")
    doc_embedder = SentenceTransformersDocumentEmbedder()
    doc_embedder.warm_up()

    result = doc_embedder.run([doc])
    print(result['documents'][0].embedding)

    # [-0.07804739475250244, 0.1498992145061493, ...]
    ```
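
    You can also embed selected metadata fields along with each document's content.
    A minimal sketch, where the `title` meta field is illustrative:

    ```python
    doc = Document(content="I love pizza!", meta={"title": "Food"})
    doc_embedder = SentenceTransformersDocumentEmbedder(meta_fields_to_embed=["title"])
    doc_embedder.warm_up()

    result = doc_embedder.run([doc])
    # The embedded text is the "title" value and the content, joined by embedding_separator.
    ```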
    """

    def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
        self,
        model: str = "sentence-transformers/all-mpnet-base-v2",
        device: Optional[ComponentDevice] = None,
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        meta_fields_to_embed: Optional[list[str]] = None,
        embedding_separator: str = "\n",
        trust_remote_code: bool = False,
        local_files_only: bool = False,
        truncate_dim: Optional[int] = None,
        model_kwargs: Optional[dict[str, Any]] = None,
        tokenizer_kwargs: Optional[dict[str, Any]] = None,
        config_kwargs: Optional[dict[str, Any]] = None,
        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
        encode_kwargs: Optional[dict[str, Any]] = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ):
        """
        Creates a SentenceTransformersDocumentEmbedder component.

        :param model:
            The model to use for calculating embeddings.
            Pass a local path or ID of the model on Hugging Face.
        :param device:
            The device to use for loading the model.
            Overrides the default device.
        :param token:
            The API token to download private models from Hugging Face.
        :param prefix:
            A string to add at the beginning of each document text.
            Can be used to prepend the text with an instruction, as required by some embedding models,
            such as E5 and bge.
        :param suffix:
            A string to add at the end of each document text.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when embedding documents.
        :param normalize_embeddings:
            If `True`, the embeddings are normalized using L2 normalization, so that each embedding has a norm of 1.
        :param meta_fields_to_embed:
            List of metadata fields to embed along with the document text.
        :param embedding_separator:
            Separator used to concatenate the metadata fields to the document text.
        :param trust_remote_code:
            If `False`, allows only Hugging Face verified model architectures.
            If `True`, allows custom models and scripts.
        :param local_files_only:
            If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files.
        :param truncate_dim:
            The dimension to truncate sentence embeddings to. `None` does no truncation.
            If the model wasn't trained with Matryoshka Representation Learning,
            truncating embeddings can significantly affect performance.
        :param model_kwargs:
            Additional keyword arguments passed to the Hugging Face model's `from_pretrained` method
            when loading the model. Refer to specific model documentation for available kwargs.
        :param tokenizer_kwargs:
            Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
            Refer to specific model documentation for available kwargs.
        :param config_kwargs:
            Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
        :param precision:
            The precision to use for the embeddings.
            All non-float32 precisions are quantized embeddings.
            Quantized embeddings are smaller and faster to compute, but may have lower accuracy.
            They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
        :param encode_kwargs:
            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
            This parameter is provided for fine-grained customization. Be careful not to clash with already set
            parameters and avoid passing parameters that change the output type.
        :param backend:
            The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino".
            Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
            for more information on acceleration and quantization options.
        """

        self.model = model
        self.device = ComponentDevice.resolve_device(device)
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator
        self.trust_remote_code = trust_remote_code
        self.local_files_only = local_files_only
        self.truncate_dim = truncate_dim
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.config_kwargs = config_kwargs
        self.encode_kwargs = encode_kwargs
        self.embedding_backend: Optional[_SentenceTransformersEmbeddingBackend] = None
        self.precision = precision
        self.backend = backend

    def _get_telemetry_data(self) -> dict[str, Any]:
        """
        Data that is sent to Posthog for usage analytics.
        """
        return {"model": self.model}  # uncovered in this build

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        serialization_dict = default_to_dict(
            self,
            model=self.model,
            device=self.device.to_dict(),
            token=self.token.to_dict() if self.token else None,
            prefix=self.prefix,
            suffix=self.suffix,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
            trust_remote_code=self.trust_remote_code,
            local_files_only=self.local_files_only,
            truncate_dim=self.truncate_dim,
            model_kwargs=self.model_kwargs,
            tokenizer_kwargs=self.tokenizer_kwargs,
            config_kwargs=self.config_kwargs,
            precision=self.precision,
            encode_kwargs=self.encode_kwargs,
            backend=self.backend,
        )
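        # model_kwargs can contain values that are not JSON-serializable, such as torch
        # dtypes, so they are converted to a serializable form before returning.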
        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
        return serialization_dict

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SentenceTransformersDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        init_params = data["init_parameters"]
        if init_params.get("device") is not None:
            init_params["device"] = ComponentDevice.from_dict(init_params["device"])
        deserialize_secrets_inplace(init_params, keys=["token"])
        if init_params.get("model_kwargs") is not None:
            deserialize_hf_model_kwargs(init_params["model_kwargs"])
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Initializes the component.
        """
        if self.embedding_backend is None:
            self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
                model=self.model,
                device=self.device.to_torch_str(),
                auth_token=self.token,
                trust_remote_code=self.trust_remote_code,
                local_files_only=self.local_files_only,
                truncate_dim=self.truncate_dim,
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
                config_kwargs=self.config_kwargs,
                backend=self.backend,
            )
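            # If the tokenizer's model_max_length is capped, mirror it on the model's
            # max_seq_length so the backend truncates inputs to the same length.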
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]):
        """
        Embed a list of documents.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings.
        """
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError(
                "SentenceTransformersDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a list of strings, please use the SentenceTransformersTextEmbedder."
            )
        if self.embedding_backend is None:
            # uncovered in this build
            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")

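        # Build one string per document: the selected meta values and the content are
        # joined with embedding_separator, then wrapped in prefix and suffix.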
        texts_to_embed = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key]
            ]
            text_to_embed = (
                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
            )
            texts_to_embed.append(text_to_embed)

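        # encode_kwargs entries are forwarded verbatim to the backend; a key that is
        # already set explicitly below (for example batch_size) would raise a TypeError
        # as a duplicate keyword argument.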
        embeddings = self.embedding_backend.embed(
            texts_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            precision=self.precision,
            **(self.encode_kwargs if self.encode_kwargs else {}),
        )

        new_documents = []
        for doc, emb in zip(documents, embeddings):
            new_documents.append(replace(doc, embedding=emb))

        return {"documents": new_documents}
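

# A minimal end-to-end sketch, not part of the component above. It assumes network access
# to download the default model and shows the prefix hook plus the to_dict/from_dict round
# trip; the "passage: " prefix is illustrative, so check your model card before using one.
if __name__ == "__main__":
    docs = [Document(content="I love pizza!", meta={"title": "Food"})]

    embedder = SentenceTransformersDocumentEmbedder(
        prefix="passage: ",  # illustrative instruction prefix (models such as E5 expect one)
        meta_fields_to_embed=["title"],
    )
    embedder.warm_up()
    embedded = embedder.run(docs)["documents"]
    print(len(embedded[0].embedding))  # dimensionality of the embedding vector

    # The component's configuration survives a serialization round trip.
    restored = SentenceTransformersDocumentEmbedder.from_dict(embedder.to_dict())
    print(restored.prefix)  # passage: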