• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 9568249476

18 Jun 2024 03:52PM UTC coverage: 89.872% (-0.1%) from 89.995%
9568249476

push

github

web-flow
ci: Add code formatting checks  (#7882)

* ruff settings

enable ruff format and re-format outdated files

feat: `EvaluationRunResult` add parameter to specify columns to keep in the comparative `Dataframe`  (#7879)

* adding param to explictily state which cols to keep

* adding param to explictily state which cols to keep

* adding param to explictily state which cols to keep

* updating tests

* adding release notes

* Update haystack/evaluation/eval_run_result.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update releasenotes/notes/add-keep-columns-to-EvalRunResult-comparative-be3e15ce45de3e0b.yaml

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* updating docstring

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

add format-check

fail on format and linting failures

fix string formatting

reformat long lines

fix tests

fix typing

linter

pull from main

* reformat

* lint -> check

* lint -> check

6957 of 7741 relevant lines covered (89.87%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.1
haystack/components/embedders/hugging_face_api_document_embedder.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
from typing import Any, Dict, List, Optional, Union
1✔
7

8
from tqdm import tqdm
1✔
9

10
from haystack import component, default_from_dict, default_to_dict, logging
1✔
11
from haystack.dataclasses import Document
1✔
12
from haystack.lazy_imports import LazyImport
1✔
13
from haystack.utils import Secret, deserialize_secrets_inplace
1✔
14
from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
1✔
15
from haystack.utils.url_validation import is_valid_http_url
1✔
16

17
with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
1✔
18
    from huggingface_hub import InferenceClient
1✔
19

20
logger = logging.getLogger(__name__)
1✔
21

22

23
@component
class HuggingFaceAPIDocumentEmbedder:
    """
    A component that embeds documents using Hugging Face APIs.

    This component can be used to compute Document embeddings using different Hugging Face APIs:
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)


    Example usage with the free Serverless Inference API:
    ```python
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
    from haystack.utils import Secret
    from haystack.dataclasses import Document

    doc = Document(content="I love pizza!")

    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="serverless_inference_api",
                                                  api_params={"model": "BAAI/bge-small-en-v1.5"},
                                                  token=Secret.from_token("<your-api-key>"))

    result = doc_embedder.run([doc])
    print(result["documents"][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```

    Example usage with paid Inference Endpoints:
    ```python
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
    from haystack.utils import Secret
    from haystack.dataclasses import Document

    doc = Document(content="I love pizza!")

    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="inference_endpoints",
                                                  api_params={"url": "<your-inference-endpoint-url>"},
                                                  token=Secret.from_token("<your-api-key>"))

    result = doc_embedder.run([doc])
    print(result["documents"][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```

    Example usage with self-hosted Text Embeddings Inference:
    ```python
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
    from haystack.dataclasses import Document

    doc = Document(content="I love pizza!")

    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="text_embeddings_inference",
                                                  api_params={"url": "http://localhost:8080"})

    result = doc_embedder.run([doc])
    print(result["documents"][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```
    """

    def __init__(
        self,
        api_type: Union[HFEmbeddingAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),
        prefix: str = "",
        suffix: str = "",
        truncate: bool = True,
        normalize: bool = False,
        batch_size: int = 32,
        progress_bar: bool = True,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
    ):
        """
        Create a HuggingFaceAPIDocumentEmbedder component.

        :param api_type:
            The type of Hugging Face API to use. A string is converted with `HFEmbeddingAPIType.from_str`.
        :param api_params:
            A dictionary containing the following keys:
            - `model`: model ID on the Hugging Face Hub. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
                `TEXT_EMBEDDINGS_INFERENCE`.
        :param token: The HuggingFace token to use as HTTP bearer authorization.
            You can find your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param prefix:
            A string to add at the beginning of each text.
        :param suffix:
            A string to add at the end of each text.
        :param truncate:
            Truncate input text from the end to the maximum length supported by the model.
            This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
            It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text
            Embeddings Inference. This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API`
            (it is always set to `True` and cannot be changed).
        :param normalize:
            Normalize the embeddings to unit length.
            This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
            It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text
            Embeddings Inference. This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API`
            (it is always set to `False` and cannot be changed).
        :param batch_size:
            Number of Documents to process at once.
        :param progress_bar:
            If `True` shows a progress bar when running.
        :param meta_fields_to_embed:
            List of meta fields that will be embedded along with the Document text.
        :param embedding_separator:
            Separator used to concatenate the meta fields to the Document text.
        :raises ValueError:
            If the `model` or `url` entry required by the chosen `api_type` is missing from `api_params`,
            if the given `url` is not a valid HTTP URL, or if `api_type` is not one of the supported types.
        """
        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFEmbeddingAPIType.from_str(api_type)

        # Tolerate a falsy value (e.g. None) so the `.get` lookups below always work on a dict.
        api_params = api_params or {}

        if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.EMBEDDING, token)
            model_or_url = model
        elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` "
                    "parameter in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            # Raise the descriptive message; previously the bare `api_type` value was raised
            # and the message built here was silently discarded.
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.truncate = truncate
        self.normalize = normalize
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator
        # The client receives the resolved token value (or None when no token is configured);
        # the `Secret` wrapper itself is kept on `self.token` for serialization.
        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        The token is serialized through `Secret.to_dict()` (or stored as `None` when no token is set) and
        `api_type` is stored as its string representation.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            prefix=self.prefix,
            suffix=self.suffix,
            token=self.token.to_dict() if self.token else None,
            truncate=self.truncate,
            normalize=self.normalize,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from. Its `init_parameters` entry is modified in place: the `token`
            key is passed through `deserialize_secrets_inplace` before the component is built.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        return default_from_dict(cls, data)

    def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
        """
        Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.

        For each Document, the selected meta values (skipping keys that are missing or `None`) are placed
        before the content, joined with `embedding_separator`, and wrapped in `prefix` / `suffix`.
        A Document without content contributes an empty string.

        :param documents:
            Documents whose content and metadata are turned into texts.
        :returns:
            One text per input Document, in the same order.
        """
        texts_to_embed = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
            ]

            text_to_embed = (
                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
            )

            texts_to_embed.append(text_to_embed)
        return texts_to_embed

    def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]:
        """
        Embed a list of texts in batches.

        Each batch is posted to the client as a `feature-extraction` task together with the configured
        `truncate` and `normalize` flags; the response bytes are decoded as JSON and accumulated.

        :param texts_to_embed:
            Texts to send to the API.
        :param batch_size:
            Maximum number of texts sent per request.
        :returns:
            The embeddings returned by the API, concatenated in request order.
        """

        all_embeddings = []
        for i in tqdm(
            range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
        ):
            batch = texts_to_embed[i : i + batch_size]
            response = self._client.post(
                json={"inputs": batch, "truncate": self.truncate, "normalize": self.normalize},
                task="feature-extraction",
            )
            embeddings = json.loads(response.decode())
            all_embeddings.extend(embeddings)

        return all_embeddings

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embed a list of Documents.

        The embeddings are written to the `embedding` attribute of the given Documents in place.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings
        :raises TypeError:
            If `documents` is not a list, or if it is a non-empty list whose first element is not a Document.
        """
        # Only the first element is inspected; an empty list is accepted and yields no embeddings.
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError(
                "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input."
                " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder."
            )

        texts_to_embed = self._prepare_texts_to_embed(documents=documents)

        embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)

        # Embeddings come back in input order, so they pair positionally with the Documents.
        for doc, emb in zip(documents, embeddings):
            doc.embedding = emb

        return {"documents": documents}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc