• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 16933015230

13 Aug 2025 09:18AM UTC coverage: 92.184% (+0.2%) from 91.969%
16933015230

Pull #9699

github

web-flow
Merge cfbd602e7 into 8160ea8bf
Pull Request #9699: feat: Update `source_id_meta_field` in `SentenceWindowRetriever` to also accept a list of values

12891 of 13984 relevant lines covered (92.18%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.41
haystack/components/generators/chat/hugging_face_api.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
from datetime import datetime
1✔
7
from typing import Any, AsyncIterable, Iterable, Optional, Union
1✔
8

9
from haystack import component, default_from_dict, default_to_dict, logging
1✔
10
from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
1✔
11
from haystack.dataclasses import (
1✔
12
    AsyncStreamingCallbackT,
13
    ChatMessage,
14
    ComponentInfo,
15
    StreamingCallbackT,
16
    StreamingChunk,
17
    SyncStreamingCallbackT,
18
    ToolCall,
19
    select_streaming_callback,
20
)
21
from haystack.dataclasses.streaming_chunk import FinishReason
1✔
22
from haystack.lazy_imports import LazyImport
1✔
23
from haystack.tools import (
1✔
24
    Tool,
25
    Toolset,
26
    _check_duplicate_tool_names,
27
    deserialize_tools_or_toolset_inplace,
28
    serialize_tools_or_toolset,
29
)
30
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
1✔
31
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
1✔
32
from haystack.utils.url_validation import is_valid_http_url
1✔
33

34
logger = logging.getLogger(__name__)
1✔
35

36
with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
1✔
37
    from huggingface_hub import (
1✔
38
        AsyncInferenceClient,
39
        ChatCompletionInputFunctionDefinition,
40
        ChatCompletionInputStreamOptions,
41
        ChatCompletionInputTool,
42
        ChatCompletionOutput,
43
        ChatCompletionOutputComplete,
44
        ChatCompletionOutputToolCall,
45
        ChatCompletionStreamOutput,
46
        ChatCompletionStreamOutputChoice,
47
        InferenceClient,
48
    )
49

50

51
def _convert_hfapi_tool_calls(hfapi_tool_calls: Optional[list["ChatCompletionOutputToolCall"]]) -> list[ToolCall]:
    """
    Convert HuggingFace API tool calls to a list of Haystack ToolCall.

    Tool calls whose arguments are a malformed JSON string, or are neither a dict nor a string,
    are skipped with a warning.

    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
    :returns: A list of ToolCall objects.
    """
    if not hfapi_tool_calls:
        return []

    tool_calls = []

    for hfapi_tc in hfapi_tool_calls:
        hf_arguments = hfapi_tc.function.arguments

        # Arguments may arrive either as an already-parsed dict or as a JSON string.
        arguments = None
        if isinstance(hf_arguments, dict):
            arguments = hf_arguments
        elif isinstance(hf_arguments, str):
            try:
                arguments = json.loads(hf_arguments)
            except json.JSONDecodeError:
                logger.warning(
                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                    _id=hfapi_tc.id,
                    _name=hfapi_tc.function.name,
                    _arguments=hf_arguments,
                )
        else:
            logger.warning(
                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                # fix: the message template references {_type}, but no `_type` value was
                # previously supplied, so the placeholder could never be filled in.
                _type=type(hf_arguments).__name__,
                _id=hfapi_tc.id,
                _name=hfapi_tc.function.name,
                _arguments=hf_arguments,
            )

        # fix: explicit None check so that tool calls with a valid but empty arguments
        # payload ({} or "{}") are not silently dropped by dict falsiness.
        if arguments is not None:
            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))

    return tool_calls
94

95

96
def _convert_tools_to_hfapi_tools(
    tools: Optional[Union[list[Tool], Toolset]],
) -> Optional[list["ChatCompletionInputTool"]]:
    """
    Convert Haystack tools into the tool format expected by the Hugging Face API.

    :param tools: A list of Tool objects or a Toolset, or None.
    :returns: A list of ChatCompletionInputTool objects, or None when no tools were given.
    """
    if not tools:
        return None

    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
    schema_field = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"

    converted: list["ChatCompletionInputTool"] = []
    for current_tool in tools:
        function_kwargs = {
            "name": current_tool.name,
            "description": current_tool.description,
            schema_field: current_tool.parameters,
        }
        function_definition = ChatCompletionInputFunctionDefinition(**function_kwargs)
        converted.append(ChatCompletionInputTool(function=function_definition, type="function"))

    return converted
114

115

116
def _map_hf_finish_reason_to_haystack(
    choice: Union["ChatCompletionStreamOutputChoice", "ChatCompletionOutputComplete"],
) -> Optional[FinishReason]:
    """
    Map HuggingFace finish reasons to Haystack FinishReason literals.

    Uses the full choice object to detect tool calls and provide accurate mapping.

    HuggingFace finish reasons (can be found here https://huggingface.github.io/text-generation-inference/ under
    FinishReason):
    - "length": number of generated tokens == `max_new_tokens`
    - "eos_token": the model generated its end of sequence token
    - "stop_sequence": the model generated a text included in `stop_sequences`

    Additionally detects tool calls from delta.tool_calls or delta.tool_call_id.

    :param choice: The HuggingFace ChatCompletionStreamOutputChoice object.
    :returns: The corresponding Haystack FinishReason or None.
    """
    if choice.finish_reason is None:
        return None

    # Streaming choices carry tool-call information on `delta`; non-streaming ones on `message`.
    payload = choice.delta if isinstance(choice, ChatCompletionStreamOutputChoice) else choice.message

    # Tool-call presence overrides whatever finish reason the API reported.
    if payload.tool_calls is not None or payload.tool_call_id is not None:
        return "tool_calls"

    hf_to_haystack: dict[str, FinishReason] = {
        "length": "length",  # token budget exhausted: direct match
        "eos_token": "stop",  # EOS token means natural stop
        "stop_sequence": "stop",  # stop sequence means natural stop
    }
    # Unknown reasons default to "stop".
    return hf_to_haystack.get(choice.finish_reason, "stop")
156

157

158
def _convert_chat_completion_stream_output_to_streaming_chunk(
    chunk: "ChatCompletionStreamOutput",
    previous_chunks: list[StreamingChunk],
    component_info: Optional[ComponentInfo] = None,
) -> StreamingChunk:
    """
    Converts the Hugging Face API ChatCompletionStreamOutput to a StreamingChunk.

    :param chunk: The raw stream output received from the Hugging Face API.
    :param previous_chunks: Chunks already produced for this stream; used to mark the first chunk.
    :param component_info: Optional information about the emitting component.
    :returns: The equivalent StreamingChunk.
    """
    # An empty `choices` list marks the final usage-only chunk emitted when
    # include_usage is set to True.
    if not chunk.choices:
        usage = None
        if chunk.usage:
            usage = {"prompt_tokens": chunk.usage.prompt_tokens, "completion_tokens": chunk.usage.completion_tokens}
        return StreamingChunk(
            content="",
            meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "usage": usage},
            component_info=component_info,
        )

    # n is unused, so the API always returns only one choice
    # the argument is probably allowed for compatibility with OpenAI
    # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
    first_choice = chunk.choices[0]
    mapped_reason = _map_hf_finish_reason_to_haystack(first_choice) if first_choice.finish_reason else None

    return StreamingChunk(
        content=first_choice.delta.content or "",
        meta={
            "model": chunk.model,
            "received_at": datetime.now().isoformat(),
            "finish_reason": first_choice.finish_reason,
        },
        component_info=component_info,
        # Index must always be 0 since we don't allow tool calls in streaming mode.
        index=0 if first_choice.finish_reason is None else None,
        # start is True at the very beginning since first chunk contains role information + first part of the answer.
        start=not previous_chunks,
        finish_reason=mapped_reason,
    )
193

194

195
@component
class HuggingFaceAPIChatGenerator:
    """
    Completes chats using Hugging Face APIs.

    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
    format for input and output. Use it to generate text with Hugging Face APIs:
    - [Serverless Inference API (Inference Providers)](https://huggingface.co/docs/inference-providers)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)

    ### Usage examples

    #### With the serverless inference API (Inference Providers) - free tier available

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
    api_type = "serverless_inference_api" # this is equivalent to the above

    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
                                            api_params={"model": "Qwen/Qwen2.5-7B-Instruct",
                                                        "provider": "together"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With the serverless inference API (Inference Providers) and text+image input

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage, ImageContent
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    # Create an image from file path, URL, or base64
    image = ImageContent.from_file_path("path/to/your/image.jpg")

    # Create a multimodal message with both text and image
    messages = [ChatMessage.from_user(content_parts=["Describe this image in detail", image])]

    generator = HuggingFaceAPIChatGenerator(
        api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
        api_params={
            "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # Vision Language Model
            "provider": "hyperbolic"
        },
        token=Secret.from_token("<your-api-key>")
    )

    result = generator.run(messages)
    print(result)
    ```

    #### With paid inference endpoints

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
                                            api_params={"url": "<your-inference-endpoint-url>"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With self-hosted text generation inference

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
                                            api_params={"url": "http://localhost:8080"})

    result = generator.run(messages)
    print(result)
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[dict[str, Any]] = None,
        stop_words: Optional[list[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
        tools: Optional[Union[list[Tool], Toolset]] = None,
    ):
        """
        Initialize the HuggingFaceAPIChatGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See
            [Serverless Inference API - Inference Providers](https://huggingface.co/docs/inference-providers).
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `provider`: Provider name. Recommended when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_GENERATION_INFERENCE`.
            - Other parameters specific to the chosen API type, such as `timeout`, `headers`, etc.
        :param token:
            The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation.
                Some examples: `max_tokens`, `temperature`, `top_p`.
                For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
        :param stop_words:
            An optional list of strings representing the stop words.
        :param streaming_callback:
            An optional callable for handling streaming responses.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls.
            The chosen model should support tool/function calling, according to the model card.
            Support for tools in the Hugging Face API and TGI is not yet fully refined and you may experience
            unexpected behavior. This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
        :raises ValueError:
            If required `api_params` keys are missing, the URL is invalid, the `api_type` is unknown,
            both `tools` and `streaming_callback` are set, or tool names are duplicated.
        """

        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        # Resolve the model id or endpoint URL the clients will talk to, validating the
        # api_params required for the chosen API type.
        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
                    "in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        # Tools and streaming are mutually exclusive (tool calls are not supported in streaming mode).
        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # handle generation kwargs setup
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
        generation_kwargs["stop"].extend(stop_words or [])
        generation_kwargs.setdefault("max_tokens", 512)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback

        # `model`/`url` are consumed above; the remaining api_params (e.g. timeout, headers,
        # provider) are forwarded to the huggingface_hub clients as-is.
        resolved_api_params: dict[str, Any] = {k: v for k, v in api_params.items() if k != "model" and k != "url"}
        self._client = InferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )
        self._async_client = AsyncInferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )
        self.tools = tools

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=self.token.to_dict() if self.token else None,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=callback_name,
            tools=serialize_tools_or_toolset(self.tools),
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
        """
        Deserialize this component from a dictionary.

        :param data: The dictionary produced by `to_dict`.
        :returns: The deserialized component.
        """
        # Secrets, tools, and the streaming callback were serialized to plain data in
        # `to_dict`; restore them in place before the generic deserialization.
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
        init_params = data.get("init_parameters", {})
        serialized_callback_handler = init_params.get("streaming_callback")
        if serialized_callback_handler:
            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
        return default_from_dict(cls, data)

    @component.output_types(replies=list[ChatMessage])
    def run(
        self,
        messages: list[ChatMessage],
        generation_kwargs: Optional[dict[str, Any]] = None,
        tools: Optional[Union[list[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override
            the `tools` parameter set during component initialization. This parameter can accept either a
            list of `Tool` objects or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        # NOTE(review): this guard only inspects `self.streaming_callback`. Passing `tools`
        # together with a runtime `streaming_callback` argument bypasses the error, and the
        # tools are then silently ignored in the streaming branch below — confirm intended.
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(
            self.streaming_callback, streaming_callback, requires_async=False
        )

        if streaming_callback:
            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)

    @component.output_types(replies=list[ChatMessage])
    async def run_async(
        self,
        messages: list[ChatMessage],
        generation_kwargs: Optional[dict[str, Any]] = None,
        tools: Optional[Union[list[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Asynchronously invokes the text generation inference based on the provided messages and generation parameters.

        This is the asynchronous version of the `run` method. It has the same parameters
        and return values but can be used with `await` in an async code.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
            parameter set during component initialization. This parameter can accept either a list of `Tool` objects
            or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        # NOTE(review): same caveat as `run` — only `self.streaming_callback` is checked here,
        # so a runtime `streaming_callback` argument combined with tools is not rejected.
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)

        if streaming_callback:
            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)

        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)

    def _run_streaming(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        streaming_callback: SyncStreamingCallbackT,
    ) -> dict[str, list[ChatMessage]]:
        # Stream chunks from the API, forwarding each one to the callback, then assemble
        # the full ChatMessage from the accumulated chunks.
        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        component_info = ComponentInfo.from_component(self)
        streaming_chunks: list[StreamingChunk] = []
        for chunk in api_output:
            streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
            )
            streaming_chunks.append(streaming_chunk)
            streaming_callback(streaming_chunk)

        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
        # Guarantee a usage entry in meta even if the API did not report one.
        if message.meta.get("usage") is None:
            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}

        return {"replies": [message]}

    def _run_non_streaming(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        tools: Optional[list["ChatCompletionInputTool"]] = None,
    ) -> dict[str, list[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if api_chat_output.choices is None or len(api_chat_output.choices) == 0:
            return {"replies": []}

        # n is unused, so the API always returns only one choice
        # the argument is probably allowed for compatibility with OpenAI
        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
        meta: dict[str, Any] = {
            "model": self._client.model,
            "finish_reason": mapped_finish_reason,
            "index": choice.index,
        }

        # Default to zero usage when the API omits it, so meta["usage"] is always present.
        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}

    async def _run_streaming_async(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        streaming_callback: AsyncStreamingCallbackT,
    ) -> dict[str, list[ChatMessage]]:
        # Async twin of `_run_streaming`: same chunk conversion, but the callback is awaited.
        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        component_info = ComponentInfo.from_component(self)
        streaming_chunks: list[StreamingChunk] = []
        async for chunk in api_output:
            stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
            )
            streaming_chunks.append(stream_chunk)
            await streaming_callback(stream_chunk)

        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
        # Guarantee a usage entry in meta even if the API did not report one.
        if message.meta.get("usage") is None:
            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}

        return {"replies": [message]}

    async def _run_non_streaming_async(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        tools: Optional[list["ChatCompletionInputTool"]] = None,
    ) -> dict[str, list[ChatMessage]]:
        # Async twin of `_run_non_streaming`.
        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if api_chat_output.choices is None or len(api_chat_output.choices) == 0:
            return {"replies": []}

        # The API always returns a single choice; see `_run_non_streaming`.
        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
        meta: dict[str, Any] = {
            "model": self._async_client.model,
            "finish_reason": mapped_finish_reason,
            "index": choice.index,
        }

        # Default to zero usage when the API omits it, so meta["usage"] is always present.
        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc