deepset-ai / haystack, build 15131674881
20 May 2025 07:35AM UTC. Coverage: 90.156% (-0.3%) from 90.471%
Pull Request #9407: feat: stream `ToolResult` from run_async in Agent (merge b382eca10 into 6ad23f822)
10972 of 12170 relevant lines covered (90.16%), 0.9 hits per line

Source file: haystack/components/embedders/sentence_transformers_document_embedder.py (96.88% covered)
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Literal, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.components.embedders.backends.sentence_transformers_backend import (
    _SentenceTransformersEmbeddingBackend,
    _SentenceTransformersEmbeddingBackendFactory,
)
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs


@component
class SentenceTransformersDocumentEmbedder:
    """
    Calculates document embeddings using Sentence Transformers models.

    It stores the embeddings in the `embedding` field of each document.
    You can also embed documents' metadata.
    Use this component in indexing pipelines to embed input documents
    and send them to DocumentWriter to write them into a Document Store.

    ### Usage example:

    ```python
    from haystack import Document
    from haystack.components.embedders import SentenceTransformersDocumentEmbedder
    doc = Document(content="I love pizza!")
    doc_embedder = SentenceTransformersDocumentEmbedder()
    doc_embedder.warm_up()

    result = doc_embedder.run([doc])
    print(result['documents'][0].embedding)

    # [-0.07804739475250244, 0.1498992145061493, ...]
    ```
    """

    def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
        self,
        model: str = "sentence-transformers/all-mpnet-base-v2",
        device: Optional[ComponentDevice] = None,
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
        trust_remote_code: bool = False,
        local_files_only: bool = False,
        truncate_dim: Optional[int] = None,
        model_kwargs: Optional[Dict[str, Any]] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        config_kwargs: Optional[Dict[str, Any]] = None,
        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
        encode_kwargs: Optional[Dict[str, Any]] = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ):
        """
        Creates a SentenceTransformersDocumentEmbedder component.

        :param model:
            The model to use for calculating embeddings.
            Pass a local path or ID of the model on Hugging Face.
        :param device:
            The device to use for loading the model.
            Overrides the default device.
        :param token:
            The API token to download private models from Hugging Face.
        :param prefix:
            A string to add at the beginning of each document text.
            Can be used to prepend the text with an instruction, as required by some embedding models,
            such as E5 and bge.
        :param suffix:
            A string to add at the end of each document text.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when embedding documents.
        :param normalize_embeddings:
            If `True`, the embeddings are normalized using L2 normalization, so that each embedding has a norm of 1.
        :param meta_fields_to_embed:
            List of metadata fields to embed along with the document text.
        :param embedding_separator:
            Separator used to concatenate the metadata fields to the document text.
        :param trust_remote_code:
            If `False`, allows only Hugging Face verified model architectures.
            If `True`, allows custom models and scripts.
        :param local_files_only:
            If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files.
        :param truncate_dim:
            The dimension to truncate sentence embeddings to. `None` does no truncation.
            If the model wasn't trained with Matryoshka Representation Learning,
            truncating embeddings can significantly affect performance.
        :param model_kwargs:
            Additional keyword arguments for `AutoModel.from_pretrained`
            when loading the model. Refer to specific model documentation for available kwargs.
        :param tokenizer_kwargs:
            Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
            Refer to specific model documentation for available kwargs.
        :param config_kwargs:
            Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
        :param precision:
            The precision to use for the embeddings.
            All non-float32 precisions are quantized embeddings.
            Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
            They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
        :param encode_kwargs:
            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
            avoid passing parameters that change the output type.
        :param backend:
            The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino".
            Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
            for more information on acceleration and quantization options.
        """

        self.model = model
        self.device = ComponentDevice.resolve_device(device)
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator
        self.trust_remote_code = trust_remote_code
        self.local_files_only = local_files_only
        self.truncate_dim = truncate_dim
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.config_kwargs = config_kwargs
        self.encode_kwargs = encode_kwargs
        self.embedding_backend: Optional[_SentenceTransformersEmbeddingBackend] = None
        self.precision = precision
        self.backend = backend

    def _get_telemetry_data(self) -> Dict[str, Any]:
        """
        Data that is sent to Posthog for usage analytics.
        """
        return {"model": self.model}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        serialization_dict = default_to_dict(
            self,
            model=self.model,
            device=self.device.to_dict(),
            token=self.token.to_dict() if self.token else None,
            prefix=self.prefix,
            suffix=self.suffix,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
            trust_remote_code=self.trust_remote_code,
            local_files_only=self.local_files_only,
            truncate_dim=self.truncate_dim,
            model_kwargs=self.model_kwargs,
            tokenizer_kwargs=self.tokenizer_kwargs,
            config_kwargs=self.config_kwargs,
            precision=self.precision,
            encode_kwargs=self.encode_kwargs,
            backend=self.backend,
        )
        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
        return serialization_dict

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        init_params = data["init_parameters"]
        if init_params.get("device") is not None:
            init_params["device"] = ComponentDevice.from_dict(init_params["device"])
        deserialize_secrets_inplace(init_params, keys=["token"])
        if init_params.get("model_kwargs") is not None:
            deserialize_hf_model_kwargs(init_params["model_kwargs"])
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Initializes the component.
        """
        if self.embedding_backend is None:
            self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
                model=self.model,
                device=self.device.to_torch_str(),
                auth_token=self.token,
                trust_remote_code=self.trust_remote_code,
                local_files_only=self.local_files_only,
                truncate_dim=self.truncate_dim,
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
                config_kwargs=self.config_kwargs,
                backend=self.backend,
            )
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embed a list of documents.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings.
        """
        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
            raise TypeError(
                "SentenceTransformersDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a list of strings, please use the SentenceTransformersTextEmbedder."
            )
        if self.embedding_backend is None:
            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")

        texts_to_embed = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key]
            ]
            text_to_embed = (
                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
            )
            texts_to_embed.append(text_to_embed)

        embeddings = self.embedding_backend.embed(
            texts_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            precision=self.precision,
            **(self.encode_kwargs if self.encode_kwargs else {}),
        )

        for doc, emb in zip(documents, embeddings):
            doc.embedding = emb

        return {"documents": documents}
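
A few usage sketches follow the listing. First, a minimal sketch of the `__init__` options documented above: embedding a metadata field alongside the text and prepending an instruction prefix. The model name and the "title" meta field are illustrative choices, not part of the source above; running this downloads the model from Hugging Face unless `local_files_only=True`.

```python
from haystack import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

# Illustrative: "intfloat/e5-base-v2" expects a "passage: " prefix for documents,
# and "title" is a hypothetical metadata field present on our documents.
doc = Document(content="Pizza was invented in Naples.", meta={"title": "History of pizza"})

embedder = SentenceTransformersDocumentEmbedder(
    model="intfloat/e5-base-v2",
    prefix="passage: ",
    meta_fields_to_embed=["title"],  # the title is embedded together with the text
    embedding_separator="\n",
    normalize_embeddings=True,       # L2-normalize, so each embedding has norm 1
)
embedder.warm_up()  # loads the model; run() raises RuntimeError without this

result = embedder.run([doc])
print(len(result["documents"][0].embedding))  # dimensionality depends on the model
```

Per the concatenation in `run()`, the string actually embedded here is `"passage: History of pizza\nPizza was invented in Naples."`.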
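
Second, a sketch of the `to_dict()`/`from_dict()` round trip. Note that the `token` Secret serializes as an environment-variable reference rather than the token value, and `model_kwargs` pass through `serialize_hf_model_kwargs` so non-JSON values such as torch dtypes survive. The exact dictionary layout shown in the comments is the `default_to_dict` convention, stated here as an assumption.

```python
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2")

config = embedder.to_dict()
# config["type"] holds the import path of the component class;
# config["init_parameters"] holds the constructor arguments, with
# device and token replaced by their serialized forms.
print(config["init_parameters"]["model"])  # "sentence-transformers/all-mpnet-base-v2"

# Rebuild an equivalent, not-yet-warmed-up component from the dictionary.
restored = SentenceTransformersDocumentEmbedder.from_dict(config)
assert restored.model == embedder.model
```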
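
Finally, the class docstring points at indexing pipelines with DocumentWriter. A sketch of that wiring using the in-memory document store that ships with Haystack; the component names and pipeline layout are illustrative.

```python
from haystack import Document, Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

indexing = Pipeline()
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store=document_store))
# The embedder's "documents" output feeds the writer's "documents" input.
indexing.connect("embedder.documents", "writer.documents")

docs = [Document(content="I love pizza!"), Document(content="Haystack builds LLM pipelines.")]
# Pipeline.run() warms up components, so no explicit warm_up() call is needed here.
indexing.run({"embedder": {"documents": docs}})
print(document_store.count_documents())  # 2
```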