• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 13972131258

20 Mar 2025 02:43PM UTC coverage: 90.021% (-0.03%) from 90.054%
13972131258

Pull #9069

github

web-flow
Merge 8371761b0 into 67ab3788e
Pull Request #9069: refactor!: `ChatMessage` serialization-deserialization updates

9833 of 10923 relevant lines covered (90.02%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.59
haystack/components/embedders/hugging_face_api_document_embedder.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import warnings
1✔
6
from typing import Any, Dict, List, Optional, Union
1✔
7

8
from tqdm import tqdm
1✔
9

10
from haystack import component, default_from_dict, default_to_dict
1✔
11
from haystack.dataclasses import Document
1✔
12
from haystack.lazy_imports import LazyImport
1✔
13
from haystack.utils import Secret, deserialize_secrets_inplace
1✔
14
from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
1✔
15
from haystack.utils.url_validation import is_valid_http_url
1✔
16

17
with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
1✔
18
    from huggingface_hub import InferenceClient
1✔
19

20

21
@component
1✔
22
class HuggingFaceAPIDocumentEmbedder:
1✔
23
    """
24
    Embeds documents using Hugging Face APIs.
25

26
    Use it with the following Hugging Face APIs:
27
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
28
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
29
    - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)
30

31

32
    ### Usage examples
33

34
    #### With free serverless inference API
35

36
    ```python
37
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
38
    from haystack.utils import Secret
39
    from haystack.dataclasses import Document
40

41
    doc = Document(content="I love pizza!")
42

43
    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="serverless_inference_api",
44
                                                  api_params={"model": "BAAI/bge-small-en-v1.5"},
45
                                                  token=Secret.from_token("<your-api-key>"))
46

47
    result = doc_embedder.run([doc])
48
    print(result["documents"][0].embedding)
49

50
    # [0.017020374536514282, -0.023255806416273117, ...]
51
    ```
52

53
    #### With paid inference endpoints
54

55
    ```python
56
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
57
    from haystack.utils import Secret
58
    from haystack.dataclasses import Document
59

60
    doc = Document(content="I love pizza!")
61

62
    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="inference_endpoints",
63
                                                  api_params={"url": "<your-inference-endpoint-url>"},
64
                                                  token=Secret.from_token("<your-api-key>"))
65

66
    result = doc_embedder.run([doc])
67
    print(result["documents"][0].embedding)
68

69
    # [0.017020374536514282, -0.023255806416273117, ...]
70
    ```
71

72
    #### With self-hosted text embeddings inference
73

74
    ```python
75
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
76
    from haystack.dataclasses import Document
77

78
    doc = Document(content="I love pizza!")
79

80
    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="text_embeddings_inference",
81
                                                  api_params={"url": "http://localhost:8080"})
82

83
    result = doc_embedder.run([doc])
84
    print(result["documents"][0].embedding)
85

86
    # [0.017020374536514282, -0.023255806416273117, ...]
87
    ```
88
    """
89

90
    def __init__(
        self,
        api_type: Union[HFEmbeddingAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        prefix: str = "",
        suffix: str = "",
        truncate: Optional[bool] = True,
        normalize: Optional[bool] = False,
        batch_size: int = 32,
        progress_bar: bool = True,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
    ):  # pylint: disable=too-many-positional-arguments
        """
        Sets up a HuggingFaceAPIDocumentEmbedder and the inference client it uses.

        :param api_type:
            Which Hugging Face API to call. A string is converted with `HFEmbeddingAPIType.from_str`.
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_EMBEDDINGS_INFERENCE`.
        :param token: The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param prefix:
            A string to add at the beginning of each text.
        :param suffix:
            A string to add at the end of each text.
        :param truncate:
            Truncates the input text to the maximum length supported by the model.
            Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
            if the backend uses Text Embeddings Inference.
            If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
        :param normalize:
            Normalizes the embeddings to unit length.
            Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
            if the backend uses Text Embeddings Inference.
            If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
        :param batch_size:
            Number of documents to process at once.
        :param progress_bar:
            If `True`, shows a progress bar when running.
        :param meta_fields_to_embed:
            List of metadata fields to embed along with the document text.
        :param embedding_separator:
            Separator used to concatenate the metadata fields to the document text.
        :raises ValueError:
            If the key required by `api_type` (`model` or `url`) is missing from `api_params`,
            if the URL is not a valid HTTP URL, or if `api_type` is not one of the handled types.
        """
        huggingface_hub_import.check()

        resolved_type = HFEmbeddingAPIType.from_str(api_type) if isinstance(api_type, str) else api_type
        params = api_params or {}

        # `target` ends up being either the model ID or the endpoint URL handed to the client.
        if resolved_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
            target = params.get("model")
            if target is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(target, HFModelType.EMBEDDING, token)
        elif resolved_type in (HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE):
            target = params.get("url")
            if target is None:
                raise ValueError(
                    "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` "
                    "parameter in `api_params`."
                )
            if not is_valid_http_url(target):
                raise ValueError(f"Invalid URL: {target}")
        else:
            raise ValueError(f"Unknown api_type {resolved_type}")

        self.api_type = resolved_type
        self.api_params = params
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.truncate = truncate
        self.normalize = normalize
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator

        # A missing/falsy token results in a client created without credentials.
        resolved_token = token.resolve_value() if token else None
        self._client = InferenceClient(target, token=resolved_token)
181

182
    def to_dict(self) -> Dict[str, Any]:
        """
        Converts the component into its dictionary representation.

        :returns:
            The dictionary built by `default_to_dict` from the component's init parameters.
        """
        # Collect the init parameters first; insertion order matches the keyword order used before.
        init_parameters = {
            "api_type": str(self.api_type),
            "api_params": self.api_params,
            "prefix": self.prefix,
            "suffix": self.suffix,
            # The token is stored through its own `to_dict`, or as None when unset.
            "token": self.token.to_dict() if self.token else None,
            "truncate": self.truncate,
            "normalize": self.normalize,
            "batch_size": self.batch_size,
            "progress_bar": self.progress_bar,
            "meta_fields_to_embed": self.meta_fields_to_embed,
            "embedding_separator": self.embedding_separator,
        }
        return default_to_dict(self, **init_parameters)
203

204
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIDocumentEmbedder":
        """
        Rebuilds the component from its dictionary representation.

        :param data:
            Dictionary to deserialize from. Its `init_parameters` entry is handed to
            `deserialize_secrets_inplace` for the `token` key before the component is built.
        :returns:
            The component created by `default_from_dict`.
        """
        init_parameters = data["init_parameters"]
        deserialize_secrets_inplace(init_parameters, keys=["token"])
        return default_from_dict(cls, data)
216

217
    def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
        """
        Build one string per Document from its selected metadata values and its content.

        For each name in `meta_fields_to_embed`, the value is included only when the field is present
        in the Document's `meta` and is not `None`. The values and the content are joined with
        `embedding_separator`, then wrapped in `prefix` and `suffix`.
        """
        prepared = []
        for document in documents:
            pieces = []
            for field in self.meta_fields_to_embed:
                if field not in document.meta:
                    continue
                value = document.meta[field]
                if value is not None:
                    pieces.append(str(value))

            # A Document without content contributes an empty string after the metadata values.
            pieces.append(document.content or "")

            joined = self.embedding_separator.join(pieces)
            prepared.append(self.prefix + joined + self.suffix)
        return prepared
233

234
    def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]:
        """
        Embed a list of texts in batches.

        :param texts_to_embed:
            Texts to send to the client's `feature_extraction` method.
        :param batch_size:
            Maximum number of texts sent in a single call.
        :returns:
            The embeddings of all batches, concatenated in input order.
        :raises ValueError:
            If a response is not two-dimensional or does not have one row per text in the batch.
        """
        truncate = self.truncate
        normalize = self.normalize

        # For the Serverless Inference API both options are dropped (set to None) with a warning.
        if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
            if truncate is not None:
                msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
                warnings.warn(msg)
                truncate = None
            if normalize is not None:
                msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
                warnings.warn(msg)
                normalize = None

        all_embeddings = []
        for i in tqdm(
            range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
        ):
            batch = texts_to_embed[i : i + batch_size]

            # NOTE(review): the result is used as an array-like exposing `ndim`, `shape` and `tolist`
            # (presumably a NumPy array) - confirm against the huggingface_hub version in use.
            np_embeddings = self._client.feature_extraction(
                # this method does not officially support list of strings, but works as expected
                text=batch,  # type: ignore[arg-type]
                truncate=truncate,
                normalize=normalize,
            )

            if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
                # Report the size of this batch: the last batch can be smaller than `batch_size`.
                raise ValueError(f"Expected embedding shape ({len(batch)}, embedding_dim), got {np_embeddings.shape}")

            all_embeddings.extend(np_embeddings.tolist())

        return all_embeddings
270

271
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Computes an embedding for each Document and stores it on the Document.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: The input documents, each with its `embedding` attribute set.

        :raises TypeError:
            If `documents` is not a list, or if it is non-empty and its first element is not a `Document`.
        """
        # Only the first element is inspected; an empty list is accepted as-is.
        looks_like_documents = isinstance(documents, list) and (
            not documents or isinstance(documents[0], Document)
        )
        if not looks_like_documents:
            raise TypeError(
                "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input."
                " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder."
            )

        texts = self._prepare_texts_to_embed(documents=documents)
        vectors = self._embed_batch(texts_to_embed=texts, batch_size=self.batch_size)

        # Embeddings come back in input order, so they pair up positionally with the documents.
        for document, vector in zip(documents, vectors):
            document.embedding = vector

        return {"documents": documents}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc