
deepset-ai / haystack · build 16564070890 (push, via GitHub)

28 Jul 2025 08:26AM UTC coverage: 91.926% (+1.1%) from 90.802%
fix(embeddings): add `encoding_format` keyword argument when calling OpenAI's `client.embeddings.create` (#9655)

* fix mypy

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
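For context on the commit above: `encoding_format` is the parameter of OpenAI's embeddings endpoint that controls whether vectors come back as floats or base64. A minimal sketch of the kind of call the commit describes, assuming the `openai` Python SDK (the model name and input are illustrative, not taken from the PR):

```python
# Sketch of an embeddings call that passes encoding_format explicitly,
# as the commit describes. Assumes the `openai` SDK; model and input
# are illustrative.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

response = client.embeddings.create(
    model="text-embedding-3-small",
    input=["What's Natural Language Processing?"],
    encoding_format="float",  # explicit instead of relying on the default
)
print(len(response.data[0].embedding))
```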

12785 of 13908 relevant lines covered (91.93%)

0.92 hits per line
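A quick arithmetic check of the figures above ("hits per line" is the average execution count per relevant line, which sits below 1.0 when some lines are never hit):

```python
# Quick check of the coverage figures reported above.
covered, relevant = 12785, 13908
print(f"{covered / relevant:.2%}")  # -> 91.93%
```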

Source File: haystack/components/generators/hugging_face_api.py (96.51% covered)

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import asdict
from datetime import datetime
from typing import Any, Dict, Iterable, List, Optional, Union, cast

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses import (
    ComponentInfo,
    FinishReason,
    StreamingCallbackT,
    StreamingChunk,
    SyncStreamingCallbackT,
    select_streaming_callback,
)
from haystack.lazy_imports import LazyImport
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model
from haystack.utils.url_validation import is_valid_http_url

with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
    from huggingface_hub import (
        InferenceClient,
        TextGenerationOutput,
        TextGenerationStreamOutput,
        TextGenerationStreamOutputToken,
    )


logger = logging.getLogger(__name__)


@component
class HuggingFaceAPIGenerator:
    """
    Generates text using Hugging Face APIs.

    Use it with the following Hugging Face APIs:
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)

    **Note:** As of July 2025, the Hugging Face Inference API no longer offers generative models through the
    `text_generation` endpoint. Generative models are now only available through providers supporting the
    `chat_completion` endpoint. As a result, this component might no longer work with the Hugging Face Inference API.
    Use the `HuggingFaceAPIChatGenerator` component, which supports the `chat_completion` endpoint.

    ### Usage examples

    #### With Hugging Face Inference Endpoints

    ```python
    from haystack.components.generators import HuggingFaceAPIGenerator
    from haystack.utils import Secret

    generator = HuggingFaceAPIGenerator(api_type="inference_endpoints",
                                        api_params={"url": "<your-inference-endpoint-url>"},
                                        token=Secret.from_token("<your-api-key>"))

    result = generator.run(prompt="What's Natural Language Processing?")
    print(result)
    ```

    #### With self-hosted text generation inference

    ```python
    from haystack.components.generators import HuggingFaceAPIGenerator

    generator = HuggingFaceAPIGenerator(api_type="text_generation_inference",
                                        api_params={"url": "http://localhost:8080"})

    result = generator.run(prompt="What's Natural Language Processing?")
    print(result)
    ```

    #### With the free serverless inference API

    Be aware that this example might not work, as the Hugging Face Inference API no longer offers models that
    support the `text_generation` endpoint. Use the `HuggingFaceAPIChatGenerator` for generative models through
    the `chat_completion` endpoint.

    ```python
    from haystack.components.generators import HuggingFaceAPIGenerator
    from haystack.utils import Secret

    generator = HuggingFaceAPIGenerator(api_type="serverless_inference_api",
                                        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
                                        token=Secret.from_token("<your-api-key>"))

    result = generator.run(prompt="What's Natural Language Processing?")
    print(result)
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[Dict[str, Any]] = None,
        stop_words: Optional[List[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Initialize the HuggingFaceAPIGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See [Serverless Inference API](https://huggingface.co/inference-api).
              This might no longer work due to changes in the models offered in the Hugging Face Inference API.
              Please use the `HuggingFaceAPIChatGenerator` component instead.
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
              `TEXT_GENERATION_INFERENCE`.
            - Other parameters specific to the chosen API type, such as `timeout`, `headers`, `provider`, etc.
        :param token: The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation. Some examples: `max_new_tokens`,
            `temperature`, `top_k`, `top_p`.
            For details, see the [Hugging Face documentation](https://huggingface.co/docs/huggingface_hub/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation).
        :param stop_words: An optional list of strings representing the stop words.
        :param streaming_callback: An optional callable for handling streaming responses.
        """

        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            logger.warning(
                "Due to changes in the models offered in Hugging Face Inference API, using this component with the "
                "Serverless Inference API might no longer work. "
                "Please use the `HuggingFaceAPIChatGenerator` component instead."
            )
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` "
                    "parameter in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"  # not covered by tests
            raise ValueError(msg)  # not covered by tests

        # handle generation kwargs setup
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop_sequences"] = generation_kwargs.get("stop_sequences", [])
        generation_kwargs["stop_sequences"].extend(stop_words or [])
        generation_kwargs.setdefault("max_new_tokens", 512)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback

        resolved_api_params: Dict[str, Any] = {k: v for k, v in api_params.items() if k != "model" and k != "url"}
        self._client = InferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=self.token.to_dict() if self.token else None,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=callback_name,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIGenerator":
        """
        Deserialize this component from a dictionary.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        init_params = data["init_parameters"]
        serialized_callback_handler = init_params.get("streaming_callback")
        if serialized_callback_handler:
            init_params["streaming_callback"] = deserialize_callable(serialized_callback_handler)
        return default_from_dict(cls, data)

    @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
    def run(
        self,
        prompt: str,
        streaming_callback: Optional[StreamingCallbackT] = None,
        generation_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Invoke the text generation inference for the given prompt and generation parameters.

        :param prompt:
            A string representing the prompt.
        :param streaming_callback:
            A callback function that is called when a new token is received from the stream.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :returns:
            A dictionary with the generated replies and metadata. Both are lists of length n.
            - replies: A list of strings representing the generated replies.
            - meta: A list of dictionaries containing the metadata for each reply.
        """
        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        # check if streaming_callback is passed
        streaming_callback = select_streaming_callback(
            init_callback=self.streaming_callback, runtime_callback=streaming_callback, requires_async=False
        )

        hf_output = self._client.text_generation(
            prompt, details=True, stream=streaming_callback is not None, **generation_kwargs
        )

        if streaming_callback is not None:
            # mypy doesn't know that hf_output is an Iterable[TextGenerationStreamOutput], so we cast it
            return self._stream_and_build_response(
                hf_output=cast(Iterable[TextGenerationStreamOutput], hf_output), streaming_callback=streaming_callback
            )

        # mypy doesn't know that hf_output is a TextGenerationOutput, so we cast it
        return self._build_non_streaming_response(cast(TextGenerationOutput, hf_output))

    def _stream_and_build_response(
        self, hf_output: Iterable["TextGenerationStreamOutput"], streaming_callback: SyncStreamingCallbackT
    ):
        chunks: List[StreamingChunk] = []
        first_chunk_time = None

        component_info = ComponentInfo.from_component(self)
        for chunk in hf_output:
            token: TextGenerationStreamOutputToken = chunk.token
            if token.special:
                continue  # not covered by tests

            chunk_metadata = {**asdict(token), **(asdict(chunk.details) if chunk.details else {})}
            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

            mapping: Dict[str, FinishReason] = {
                "length": "length",  # Direct match
                "eos_token": "stop",  # EOS token means natural stop
                "stop_sequence": "stop",  # Stop sequence means natural stop
            }
            mapped_finish_reason = (
                mapping.get(chunk_metadata["finish_reason"], "stop") if chunk_metadata.get("finish_reason") else None
            )
            stream_chunk = StreamingChunk(
                content=token.text,
                meta=chunk_metadata,
                component_info=component_info,
                index=0,
                start=len(chunks) == 0,
                finish_reason=mapped_finish_reason,
            )
            chunks.append(stream_chunk)
            streaming_callback(stream_chunk)

        metadata = {
            "finish_reason": chunks[-1].meta.get("finish_reason", None),
            "model": self._client.model,
            "usage": {"completion_tokens": chunks[-1].meta.get("generated_tokens", 0)},
            "completion_start_time": first_chunk_time,
        }
        return {"replies": ["".join([chunk.content for chunk in chunks])], "meta": [metadata]}

    def _build_non_streaming_response(self, hf_output: "TextGenerationOutput"):
        meta = [
            {
                "model": self._client.model,
                "finish_reason": hf_output.details.finish_reason if hf_output.details else None,
                "usage": {"completion_tokens": len(hf_output.details.tokens) if hf_output.details else 0},
            }
        ]
        return {"replies": [hf_output.generated_text], "meta": meta}
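
To tie the pieces above together, here is a minimal usage sketch, assuming a self-hosted TGI endpoint at a placeholder URL. It exercises the streaming path (`_stream_and_build_response`) and the `to_dict`/`from_dict` round trip; `print_chunk` is an illustrative callback, not part of the library:

```python
# Minimal usage sketch for HuggingFaceAPIGenerator; the endpoint URL is a
# placeholder and print_chunk is an illustrative callback.
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.dataclasses import StreamingChunk


def print_chunk(chunk: StreamingChunk) -> None:
    # Invoked once per streamed token; see _stream_and_build_response above.
    print(chunk.content, end="", flush=True)


generator = HuggingFaceAPIGenerator(
    api_type="text_generation_inference",
    api_params={"url": "http://localhost:8080"},
    streaming_callback=print_chunk,
)

# Round trip through to_dict/from_dict: the streaming callback is serialized
# by import path and re-resolved on load, and the token stays a Secret.
restored = HuggingFaceAPIGenerator.from_dict(generator.to_dict())

result = restored.run(prompt="What's Natural Language Processing?")
print()
print(result["meta"][0])  # model, finish_reason, usage, completion_start_time
```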