deepset-ai / haystack, build 15131674881
20 May 2025 07:35AM UTC. Coverage: 90.156% (-0.3%) from 90.471%
Pull Request #9407: feat: stream `ToolResult` from run_async in Agent (merge b382eca10 into 6ad23f822)
10972 of 12170 relevant lines covered (90.16%), 0.9 hits per line

Source file: haystack/components/embedders/sentence_transformers_document_embedder.py (96.88% covered)
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Literal, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.components.embedders.backends.sentence_transformers_backend import (
    _SentenceTransformersEmbeddingBackend,
    _SentenceTransformersEmbeddingBackendFactory,
)
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs


@component
class SentenceTransformersDocumentEmbedder:
    """
    Calculates document embeddings using Sentence Transformers models.

    It stores the embeddings in the `embedding` field of each document.
    You can also embed documents' metadata.
    Use this component in indexing pipelines to embed input documents
    and send them to DocumentWriter to write them into a Document Store.

    ### Usage example:

    ```python
    from haystack import Document
    from haystack.components.embedders import SentenceTransformersDocumentEmbedder
    doc = Document(content="I love pizza!")
    doc_embedder = SentenceTransformersDocumentEmbedder()
    doc_embedder.warm_up()

    result = doc_embedder.run([doc])
    print(result['documents'][0].embedding)

    # [-0.07804739475250244, 0.1498992145061493, ...]
    ```
    """

    def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
        self,
        model: str = "sentence-transformers/all-mpnet-base-v2",
        device: Optional[ComponentDevice] = None,
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
        trust_remote_code: bool = False,
        local_files_only: bool = False,
        truncate_dim: Optional[int] = None,
        model_kwargs: Optional[Dict[str, Any]] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        config_kwargs: Optional[Dict[str, Any]] = None,
        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
        encode_kwargs: Optional[Dict[str, Any]] = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ):
        """
        Creates a SentenceTransformersDocumentEmbedder component.

        :param model:
            The model to use for calculating embeddings.
            Pass a local path or ID of the model on Hugging Face.
        :param device:
            The device to use for loading the model.
            Overrides the default device.
        :param token:
            The API token to download private models from Hugging Face.
        :param prefix:
            A string to add at the beginning of each document text.
            Can be used to prepend the text with an instruction, as required by some embedding models,
            such as E5 and bge.
        :param suffix:
            A string to add at the end of each document text.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when embedding documents.
        :param normalize_embeddings:
            If `True`, the embeddings are normalized using L2 normalization, so that each embedding has a norm of 1.
        :param meta_fields_to_embed:
            List of metadata fields to embed along with the document text.
        :param embedding_separator:
            Separator used to concatenate the metadata fields to the document text.
        :param trust_remote_code:
            If `False`, allows only Hugging Face verified model architectures.
            If `True`, allows custom models and scripts.
        :param local_files_only:
            If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files.
        :param truncate_dim:
            The dimension to truncate sentence embeddings to. `None` does no truncation.
            If the model wasn't trained with Matryoshka Representation Learning,
            truncating embeddings can significantly affect performance.
        :param model_kwargs:
            Additional keyword arguments for `AutoModel.from_pretrained`
            when loading the model. Refer to specific model documentation for available kwargs.
        :param tokenizer_kwargs:
            Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
            Refer to specific model documentation for available kwargs.
        :param config_kwargs:
            Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
        :param precision:
            The precision to use for the embeddings.
            All non-float32 precisions are quantized embeddings.
            Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
            They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
        :param encode_kwargs:
            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
            avoid passing parameters that change the output type.
        :param backend:
            The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino".
            Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
            for more information on acceleration and quantization options.
        """

        self.model = model
        self.device = ComponentDevice.resolve_device(device)
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator
        self.trust_remote_code = trust_remote_code
        self.local_files_only = local_files_only
        self.truncate_dim = truncate_dim
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.config_kwargs = config_kwargs
        self.encode_kwargs = encode_kwargs
        self.embedding_backend: Optional[_SentenceTransformersEmbeddingBackend] = None
        self.precision = precision
        self.backend = backend

    def _get_telemetry_data(self) -> Dict[str, Any]:
        """
        Data that is sent to Posthog for usage analytics.
        """
        return {"model": self.model}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        serialization_dict = default_to_dict(
            self,
            model=self.model,
            device=self.device.to_dict(),
            token=self.token.to_dict() if self.token else None,
            prefix=self.prefix,
            suffix=self.suffix,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
            trust_remote_code=self.trust_remote_code,
            local_files_only=self.local_files_only,
            truncate_dim=self.truncate_dim,
            model_kwargs=self.model_kwargs,
            tokenizer_kwargs=self.tokenizer_kwargs,
            config_kwargs=self.config_kwargs,
            precision=self.precision,
            encode_kwargs=self.encode_kwargs,
            backend=self.backend,
        )
        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
        return serialization_dict

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        init_params = data["init_parameters"]
        if init_params.get("device") is not None:
            init_params["device"] = ComponentDevice.from_dict(init_params["device"])
        deserialize_secrets_inplace(init_params, keys=["token"])
        if init_params.get("model_kwargs") is not None:
            deserialize_hf_model_kwargs(init_params["model_kwargs"])
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Initializes the component.
        """
        if self.embedding_backend is None:
            self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
                model=self.model,
                device=self.device.to_torch_str(),
                auth_token=self.token,
                trust_remote_code=self.trust_remote_code,
                local_files_only=self.local_files_only,
                truncate_dim=self.truncate_dim,
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
                config_kwargs=self.config_kwargs,
                backend=self.backend,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embed a list of documents.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings.
        """
        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
            raise TypeError(
                "SentenceTransformersDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a list of strings, please use the SentenceTransformersTextEmbedder."
            )
        if self.embedding_backend is None:
            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")

        texts_to_embed = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key]
            ]
            text_to_embed = (
                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
            )
            texts_to_embed.append(text_to_embed)

        embeddings = self.embedding_backend.embed(
            texts_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            precision=self.precision,
            **(self.encode_kwargs if self.encode_kwargs else {}),
        )

        for doc, emb in zip(documents, embeddings):
            doc.embedding = emb

        return {"documents": documents}
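
A few usage sketches follow the listing. First, a minimal sketch of the `__init__` options documented above: embedding a metadata field alongside the text and prepending an instruction prefix. The model name and the "title" meta field are illustrative choices, not part of the source above; running this downloads the model from Hugging Face unless `local_files_only=True`.

```python
from haystack import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

# Illustrative: "intfloat/e5-base-v2" expects a "passage: " prefix for documents,
# and "title" is a hypothetical metadata field present on our documents.
doc = Document(content="Pizza was invented in Naples.", meta={"title": "History of pizza"})

embedder = SentenceTransformersDocumentEmbedder(
    model="intfloat/e5-base-v2",
    prefix="passage: ",
    meta_fields_to_embed=["title"],  # the title is embedded together with the text
    embedding_separator="\n",
    normalize_embeddings=True,       # L2-normalize, so each embedding has norm 1
)
embedder.warm_up()  # loads the model; run() raises RuntimeError without this

result = embedder.run([doc])
print(len(result["documents"][0].embedding))  # dimensionality depends on the model
```

Per the concatenation in `run()`, the string actually embedded here is `"passage: History of pizza\nPizza was invented in Naples."`.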
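
Second, a sketch of the `to_dict()`/`from_dict()` round trip. Note that the `token` Secret serializes as an environment-variable reference rather than the token value, and `model_kwargs` pass through `serialize_hf_model_kwargs` so non-JSON values such as torch dtypes survive. The exact dictionary layout shown in the comments is the `default_to_dict` convention, stated here as an assumption.

```python
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2")

config = embedder.to_dict()
# config["type"] holds the import path of the component class;
# config["init_parameters"] holds the constructor arguments, with
# device and token replaced by their serialized forms.
print(config["init_parameters"]["model"])  # "sentence-transformers/all-mpnet-base-v2"

# Rebuild an equivalent, not-yet-warmed-up component from the dictionary.
restored = SentenceTransformersDocumentEmbedder.from_dict(config)
assert restored.model == embedder.model
```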
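
Finally, the class docstring points at indexing pipelines with DocumentWriter. A sketch of that wiring using the in-memory document store that ships with Haystack; the component names and pipeline layout are illustrative.

```python
from haystack import Document, Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

indexing = Pipeline()
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store=document_store))
# The embedder's "documents" output feeds the writer's "documents" input.
indexing.connect("embedder.documents", "writer.documents")

docs = [Document(content="I love pizza!"), Document(content="Haystack builds LLM pipelines.")]
# Pipeline.run() warms up components, so no explicit warm_up() call is needed here.
indexing.run({"embedder": {"documents": docs}})
print(document_store.count_documents())  # 2
```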