deepset-ai / haystack / build 16933015230
13 Aug 2025 09:18AM UTC. Coverage: 92.184% (+0.2%) from 91.969%.

Pull Request #9699: feat: Update `source_id_meta_field` in `SentenceWindowRetriever` to also accept a list of values
Merge cfbd602e7 into 8160ea8bf

12891 of 13984 relevant lines covered (92.18%), 0.92 hits per line.

Source file: haystack/components/embedders/sentence_transformers_document_embedder.py (96.97% covered)

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import replace
from typing import Any, Literal, Optional

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.components.embedders.backends.sentence_transformers_backend import (
    _SentenceTransformersEmbeddingBackend,
    _SentenceTransformersEmbeddingBackendFactory,
)
from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace
from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs


@component
class SentenceTransformersDocumentEmbedder:
    """
    Calculates document embeddings using Sentence Transformers models.

    It stores the embeddings in the `embedding` field of each document.
    You can also embed documents' metadata.
    Use this component in indexing pipelines to embed input documents
    and send them to a DocumentWriter to write them into a Document Store.

    ### Usage example:

    ```python
    from haystack import Document
    from haystack.components.embedders import SentenceTransformersDocumentEmbedder

    doc = Document(content="I love pizza!")
    doc_embedder = SentenceTransformersDocumentEmbedder()
    doc_embedder.warm_up()

    result = doc_embedder.run([doc])
    print(result['documents'][0].embedding)

    # [-0.07804739475250244, 0.1498992145061493, ...]
    ```
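
    You can also embed selected metadata fields along with each document's content.
    A minimal sketch, where the `title` meta field is illustrative:

    ```python
    doc = Document(content="I love pizza!", meta={"title": "Food"})
    doc_embedder = SentenceTransformersDocumentEmbedder(meta_fields_to_embed=["title"])
    doc_embedder.warm_up()

    result = doc_embedder.run([doc])
    # The embedded text is the "title" value and the content, joined by embedding_separator.
    ```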
    """

    def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
        self,
        model: str = "sentence-transformers/all-mpnet-base-v2",
        device: Optional[ComponentDevice] = None,
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 32,
        progress_bar: bool = True,
        normalize_embeddings: bool = False,
        meta_fields_to_embed: Optional[list[str]] = None,
        embedding_separator: str = "\n",
        trust_remote_code: bool = False,
        local_files_only: bool = False,
        truncate_dim: Optional[int] = None,
        model_kwargs: Optional[dict[str, Any]] = None,
        tokenizer_kwargs: Optional[dict[str, Any]] = None,
        config_kwargs: Optional[dict[str, Any]] = None,
        precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
        encode_kwargs: Optional[dict[str, Any]] = None,
        backend: Literal["torch", "onnx", "openvino"] = "torch",
    ):
        """
        Creates a SentenceTransformersDocumentEmbedder component.

        :param model:
            The model to use for calculating embeddings.
            Pass a local path or ID of the model on Hugging Face.
        :param device:
            The device to use for loading the model.
            Overrides the default device.
        :param token:
            The API token to download private models from Hugging Face.
        :param prefix:
            A string to add at the beginning of each document text.
            Can be used to prepend the text with an instruction, as required by some embedding models,
            such as E5 and bge.
        :param suffix:
            A string to add at the end of each document text.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when embedding documents.
        :param normalize_embeddings:
            If `True`, the embeddings are normalized using L2 normalization, so that each embedding has a norm of 1.
        :param meta_fields_to_embed:
            List of metadata fields to embed along with the document text.
        :param embedding_separator:
            Separator used to concatenate the metadata fields to the document text.
        :param trust_remote_code:
            If `False`, allows only Hugging Face verified model architectures.
            If `True`, allows custom models and scripts.
        :param local_files_only:
            If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files.
        :param truncate_dim:
            The dimension to truncate sentence embeddings to. `None` does no truncation.
            If the model wasn't trained with Matryoshka Representation Learning,
            truncating embeddings can significantly affect performance.
        :param model_kwargs:
            Additional keyword arguments passed to the Hugging Face model's `from_pretrained` method
            when loading the model. Refer to specific model documentation for available kwargs.
        :param tokenizer_kwargs:
            Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
            Refer to specific model documentation for available kwargs.
        :param config_kwargs:
            Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
        :param precision:
            The precision to use for the embeddings.
            All non-float32 precisions are quantized embeddings.
            Quantized embeddings are smaller and faster to compute, but may have lower accuracy.
            They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
        :param encode_kwargs:
            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
            This parameter is provided for fine-grained customization. Be careful not to clash with already set
            parameters and avoid passing parameters that change the output type.
        :param backend:
            The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino".
            Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
            for more information on acceleration and quantization options.
        """

        self.model = model
        self.device = ComponentDevice.resolve_device(device)
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.normalize_embeddings = normalize_embeddings
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator
        self.trust_remote_code = trust_remote_code
        self.local_files_only = local_files_only
        self.truncate_dim = truncate_dim
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.config_kwargs = config_kwargs
        self.encode_kwargs = encode_kwargs
        self.embedding_backend: Optional[_SentenceTransformersEmbeddingBackend] = None
        self.precision = precision
        self.backend = backend

    def _get_telemetry_data(self) -> dict[str, Any]:
        """
        Data that is sent to Posthog for usage analytics.
        """
        return {"model": self.model}  # uncovered in this build

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        serialization_dict = default_to_dict(
            self,
            model=self.model,
            device=self.device.to_dict(),
            token=self.token.to_dict() if self.token else None,
            prefix=self.prefix,
            suffix=self.suffix,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
            trust_remote_code=self.trust_remote_code,
            local_files_only=self.local_files_only,
            truncate_dim=self.truncate_dim,
            model_kwargs=self.model_kwargs,
            tokenizer_kwargs=self.tokenizer_kwargs,
            config_kwargs=self.config_kwargs,
            precision=self.precision,
            encode_kwargs=self.encode_kwargs,
            backend=self.backend,
        )
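        # model_kwargs can contain values that are not JSON-serializable, such as torch
        # dtypes, so they are converted to a serializable form before returning.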
        if serialization_dict["init_parameters"].get("model_kwargs") is not None:
            serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
        return serialization_dict

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SentenceTransformersDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        init_params = data["init_parameters"]
        if init_params.get("device") is not None:
            init_params["device"] = ComponentDevice.from_dict(init_params["device"])
        deserialize_secrets_inplace(init_params, keys=["token"])
        if init_params.get("model_kwargs") is not None:
            deserialize_hf_model_kwargs(init_params["model_kwargs"])
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Initializes the component.
        """
        if self.embedding_backend is None:
            self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
                model=self.model,
                device=self.device.to_torch_str(),
                auth_token=self.token,
                trust_remote_code=self.trust_remote_code,
                local_files_only=self.local_files_only,
                truncate_dim=self.truncate_dim,
                model_kwargs=self.model_kwargs,
                tokenizer_kwargs=self.tokenizer_kwargs,
                config_kwargs=self.config_kwargs,
                backend=self.backend,
            )
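            # If the tokenizer's model_max_length is capped, mirror it on the model's
            # max_seq_length so the backend truncates inputs to the same length.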
            if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
                self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]):
        """
        Embed a list of documents.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings.
        """
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError(
                "SentenceTransformersDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a list of strings, please use the SentenceTransformersTextEmbedder."
            )
        if self.embedding_backend is None:
            # uncovered in this build
            raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.")

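        # Build one string per document: the selected meta values and the content are
        # joined with embedding_separator, then wrapped in prefix and suffix.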
        texts_to_embed = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key]
            ]
            text_to_embed = (
                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
            )
            texts_to_embed.append(text_to_embed)

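        # encode_kwargs entries are forwarded verbatim to the backend; a key that is
        # already set explicitly below (for example batch_size) would raise a TypeError
        # as a duplicate keyword argument.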
        embeddings = self.embedding_backend.embed(
            texts_to_embed,
            batch_size=self.batch_size,
            show_progress_bar=self.progress_bar,
            normalize_embeddings=self.normalize_embeddings,
            precision=self.precision,
            **(self.encode_kwargs if self.encode_kwargs else {}),
        )

        new_documents = []
        for doc, emb in zip(documents, embeddings):
            new_documents.append(replace(doc, embedding=emb))

        return {"documents": new_documents}
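

# A minimal end-to-end sketch, not part of the component above. It assumes network access
# to download the default model and shows the prefix hook plus the to_dict/from_dict round
# trip; the "passage: " prefix is illustrative, so check your model card before using one.
if __name__ == "__main__":
    docs = [Document(content="I love pizza!", meta={"title": "Food"})]

    embedder = SentenceTransformersDocumentEmbedder(
        prefix="passage: ",  # illustrative instruction prefix (models such as E5 expect one)
        meta_fields_to_embed=["title"],
    )
    embedder.warm_up()
    embedded = embedder.run(docs)["documents"]
    print(len(embedded[0].embedding))  # dimensionality of the embedding vector

    # The component's configuration survives a serialization round trip.
    restored = SentenceTransformersDocumentEmbedder.from_dict(embedder.to_dict())
    print(restored.prefix)  # passage: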