deepset-ai / haystack · build 18595249452 · 17 Oct 2025 02:08PM UTC
Pull Request #9886 (merge ad30d1879 into cc4f024af): feat: Update tools param to Optional[Union[list[Union[Tool, Toolset]], Toolset]]
Coverage: 92.22% (+0.02% from 92.2%), 13382 of 14511 relevant lines covered, 0.92 hits per line
Source file: haystack/components/generators/chat/hugging_face_api.py (96.39% covered)

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from datetime import datetime
from typing import Any, AsyncIterable, Iterable, Optional, Union

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
from haystack.dataclasses import (
    AsyncStreamingCallbackT,
    ChatMessage,
    ComponentInfo,
    StreamingCallbackT,
    StreamingChunk,
    SyncStreamingCallbackT,
    ToolCall,
    select_streaming_callback,
)
from haystack.dataclasses.streaming_chunk import FinishReason
from haystack.lazy_imports import LazyImport
from haystack.tools import (
    ToolsType,
    _check_duplicate_tool_names,
    deserialize_tools_or_toolset_inplace,
    flatten_tools_or_toolsets,
    serialize_tools_or_toolset,
)
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
from haystack.utils.url_validation import is_valid_http_url

logger = logging.getLogger(__name__)

with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
    from huggingface_hub import (
        AsyncInferenceClient,
        ChatCompletionInputFunctionDefinition,
        ChatCompletionInputStreamOptions,
        ChatCompletionInputTool,
        ChatCompletionOutput,
        ChatCompletionOutputComplete,
        ChatCompletionOutputToolCall,
        ChatCompletionStreamOutput,
        ChatCompletionStreamOutputChoice,
        InferenceClient,
    )


def _convert_hfapi_tool_calls(hfapi_tool_calls: Optional[list["ChatCompletionOutputToolCall"]]) -> list[ToolCall]:
    """
    Convert HuggingFace API tool calls to a list of Haystack ToolCall.

    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
    :returns: A list of ToolCall objects.
    """
    if not hfapi_tool_calls:
        return []

    tool_calls = []

    for hfapi_tc in hfapi_tool_calls:
        hf_arguments = hfapi_tc.function.arguments

        arguments = None
        if isinstance(hf_arguments, dict):
            arguments = hf_arguments
        elif isinstance(hf_arguments, str):
            try:
                arguments = json.loads(hf_arguments)
            except json.JSONDecodeError:
                logger.warning(
                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                    _id=hfapi_tc.id,
                    _name=hfapi_tc.function.name,
                    _arguments=hf_arguments,
                )
        else:
            logger.warning(
                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                _type=type(hf_arguments).__name__,
                _id=hfapi_tc.id,
                _name=hfapi_tc.function.name,
                _arguments=hf_arguments,
            )

        if arguments:
            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))

    return tool_calls
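
# Behavior sketch for _convert_hfapi_tool_calls (illustrative, not part of the original
# module; tool name and ID are hypothetical): arguments arriving as the JSON string
# '{"city": "Paris"}' are parsed into a dict, yielding
#     ToolCall(tool_name="get_weather", arguments={"city": "Paris"}, id="call_0")
# whereas a malformed string such as '{"city": ' is logged and skipped, so the returned
# list can be shorter than the input.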

def _convert_tools_to_hfapi_tools(tools: Optional[ToolsType]) -> Optional[list["ChatCompletionInputTool"]]:
    if not tools:
        return None

    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
    parameters_name = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"

    hf_tools = []
    for tool in flatten_tools_or_toolsets(tools):
        hf_tools_args = {"name": tool.name, "description": tool.description, parameters_name: tool.parameters}

        hf_tools.append(
            ChatCompletionInputTool(function=ChatCompletionInputFunctionDefinition(**hf_tools_args), type="function")
        )

    return hf_tools
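
# Behavior sketch for _convert_tools_to_hfapi_tools (illustrative, not part of the
# original module): a Haystack Tool with name="get_weather" and a JSON-schema
# `parameters` dict is wrapped as
#     ChatCompletionInputTool(
#         function=ChatCompletionInputFunctionDefinition(name=..., description=..., parameters=...),
#         type="function",
#     )
# with the schema passed via the keyword "arguments" instead of "parameters" on
# huggingface_hub<0.31.0, per the version check above.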

def _map_hf_finish_reason_to_haystack(
    choice: Union["ChatCompletionStreamOutputChoice", "ChatCompletionOutputComplete"],
) -> Optional[FinishReason]:
    """
    Map HuggingFace finish reasons to Haystack FinishReason literals.

    Uses the full choice object to detect tool calls and provide an accurate mapping.

    HuggingFace finish reasons (listed at https://huggingface.github.io/text-generation-inference/ under
    FinishReason):
    - "length": the number of generated tokens reached `max_new_tokens`
    - "eos_token": the model generated its end-of-sequence token
    - "stop_sequence": the model generated a text included in `stop_sequences`

    Additionally detects tool calls from delta.tool_calls or delta.tool_call_id.

    :param choice: The HuggingFace ChatCompletionStreamOutputChoice or ChatCompletionOutputComplete object.
    :returns: The corresponding Haystack FinishReason or None.
    """
    if choice.finish_reason is None:
        return None

    # Check if this choice contains tool call information
    if isinstance(choice, ChatCompletionStreamOutputChoice):
        has_tool_calls = choice.delta.tool_calls is not None or choice.delta.tool_call_id is not None
    else:
        has_tool_calls = choice.message.tool_calls is not None or choice.message.tool_call_id is not None

    # If we detect tool calls, override the finish reason
    if has_tool_calls:
        return "tool_calls"

    # Map HuggingFace finish reasons to Haystack standard ones
    mapping: dict[str, FinishReason] = {
        "length": "length",  # Direct match
        "eos_token": "stop",  # EOS token means natural stop
        "stop_sequence": "stop",  # Stop sequence means natural stop
    }

    return mapping.get(choice.finish_reason, "stop")  # Default to "stop" for unknown reasons
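
# Behavior sketch for _map_hf_finish_reason_to_haystack (illustrative, not part of the
# original module): a choice with finish_reason="eos_token" and no tool calls maps to
# "stop"; the same finish reason on a choice whose delta (or message) carries tool
# calls maps to "tool_calls"; an unrecognized finish reason falls back to "stop".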

def _convert_chat_completion_stream_output_to_streaming_chunk(
    chunk: "ChatCompletionStreamOutput",
    previous_chunks: list[StreamingChunk],
    component_info: Optional[ComponentInfo] = None,
) -> StreamingChunk:
    """
    Converts the Hugging Face API ChatCompletionStreamOutput to a StreamingChunk.
    """
    # When include_usage is set to True, the final chunk carries the usage information and has an empty choices list.
    if len(chunk.choices) == 0:
        usage = None
        if chunk.usage:
            usage = {"prompt_tokens": chunk.usage.prompt_tokens, "completion_tokens": chunk.usage.completion_tokens}
        return StreamingChunk(
            content="",
            meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "usage": usage},
            component_info=component_info,
        )

    # Since `n` is unused, the API always returns only one choice.
    # The argument is probably allowed for compatibility with OpenAI.
    # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
    choice = chunk.choices[0]
    mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
    stream_chunk = StreamingChunk(
        content=choice.delta.content or "",
        meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "finish_reason": choice.finish_reason},
        component_info=component_info,
        # Index must always be 0 since we don't allow tool calls in streaming mode.
        index=0 if choice.finish_reason is None else None,
        # start is True at the very beginning since the first chunk contains the role information plus the first
        # part of the answer.
        start=len(previous_chunks) == 0,
        finish_reason=mapped_finish_reason,
    )
    return stream_chunk
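
# Behavior sketch for _convert_chat_completion_stream_output_to_streaming_chunk
# (illustrative, not part of the original module): a usage-only chunk (empty `choices`,
# emitted because include_usage=True) becomes a StreamingChunk with content="" and
# meta["usage"] populated; a content chunk carries the delta text, start=True only for
# the first chunk of the stream, and a mapped finish_reason once generation ends.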

@component
class HuggingFaceAPIChatGenerator:
    """
    Completes chats using Hugging Face APIs.

    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
    format for input and output. Use it to generate text with Hugging Face APIs:
    - [Serverless Inference API (Inference Providers)](https://huggingface.co/docs/inference-providers)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)

    ### Usage examples

    #### With the serverless inference API (Inference Providers) - free tier available

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
    api_type = "serverless_inference_api"  # this is equivalent to the above

    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
                                            api_params={"model": "Qwen/Qwen2.5-7B-Instruct",
                                                        "provider": "together"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With the serverless inference API (Inference Providers) and text+image input

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage, ImageContent
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    # Create an image from a file path, URL, or base64
    image = ImageContent.from_file_path("path/to/your/image.jpg")

    # Create a multimodal message with both text and image
    messages = [ChatMessage.from_user(content_parts=["Describe this image in detail", image])]

    generator = HuggingFaceAPIChatGenerator(
        api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
        api_params={
            "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # Vision Language Model
            "provider": "hyperbolic"
        },
        token=Secret.from_token("<your-api-key>")
    )

    result = generator.run(messages)
    print(result)
    ```

    #### With paid inference endpoints

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
                                            api_params={"url": "<your-inference-endpoint-url>"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With self-hosted text generation inference

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
                                            api_params={"url": "http://localhost:8080"})

    result = generator.run(messages)
    print(result)
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[dict[str, Any]] = None,
        stop_words: Optional[list[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
        tools: Optional[ToolsType] = None,
    ):
        """
        Initialize the HuggingFaceAPIChatGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See
            [Serverless Inference API - Inference Providers](https://huggingface.co/docs/inference-providers).
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `provider`: Provider name. Recommended when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_GENERATION_INFERENCE`.
            - Other parameters specific to the chosen API type, such as `timeout`, `headers`, etc.
        :param token:
            The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation.
            Some examples: `max_tokens`, `temperature`, `top_p`.
            For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
        :param stop_words:
            An optional list of strings representing the stop words.
        :param streaming_callback:
            An optional callable for handling streaming responses.
        :param tools:
            A list of Tool and/or Toolset objects, or a single Toolset, for which the model can prepare calls.
            The chosen model should support tool/function calling, according to its model card.
            Support for tools in the Hugging Face API and TGI is not yet fully refined, so you may experience
            unexpected behavior.
        """

        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
                    "in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(flatten_tools_or_toolsets(tools))

        # handle generation kwargs setup
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
        generation_kwargs["stop"].extend(stop_words or [])
        generation_kwargs.setdefault("max_tokens", 512)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback

        resolved_api_params: dict[str, Any] = {k: v for k, v in api_params.items() if k != "model" and k != "url"}
        self._client = InferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )
        self._async_client = AsyncInferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )
        self.tools = tools
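
    # Construction sketch (illustrative, not part of the original module; the tool is
    # hypothetical): `tools` accepts a list mixing Tool and Toolset objects, or a
    # single Toolset, e.g.
    #     from haystack.tools import Tool
    #     weather_tool = Tool(
    #         name="get_weather",
    #         description="Get the weather for a city",
    #         parameters={"type": "object", "properties": {"city": {"type": "string"}}},
    #         function=lambda city: f"Sunny in {city}",
    #     )
    #     generator = HuggingFaceAPIChatGenerator(
    #         api_type="serverless_inference_api",
    #         api_params={"model": "Qwen/Qwen2.5-7B-Instruct"},
    #         tools=[weather_tool],
    #     )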

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=self.token.to_dict() if self.token else None,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=callback_name,
            tools=serialize_tools_or_toolset(self.tools),
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
        """
        Deserialize this component from a dictionary.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
        init_params = data.get("init_parameters", {})
        serialized_callback_handler = init_params.get("streaming_callback")
        if serialized_callback_handler:
            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
        return default_from_dict(cls, data)
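
    # Round-trip sketch (illustrative, not part of the original module): for a
    # component whose init parameters are all serializable (e.g. an env-var based
    # token),
    #     restored = HuggingFaceAPIChatGenerator.from_dict(generator.to_dict())
    # reconstructs an equivalent instance; the token is stored as its env-var spec,
    # never as the resolved secret value.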

    @component.output_types(replies=list[ChatMessage])
    def run(
        self,
        messages: list[ChatMessage],
        generation_kwargs: Optional[dict[str, Any]] = None,
        tools: Optional[ToolsType] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of Tool and/or Toolset objects, or a single Toolset, for which the model can prepare calls.
            If set, it overrides the `tools` parameter set during component initialization.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it overrides the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        flat_tools = flatten_tools_or_toolsets(tools)
        _check_duplicate_tool_names(flat_tools)

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(
            self.streaming_callback, streaming_callback, requires_async=False
        )

        if streaming_callback:
            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)
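
    # Invocation sketch (illustrative, not part of the original module; values are
    # hypothetical): per-call arguments override the defaults set in __init__, e.g.
    #     result = generator.run(messages, generation_kwargs={"temperature": 0.2})
    #     print(result["replies"][0].text)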

    @component.output_types(replies=list[ChatMessage])
    async def run_async(
        self,
        messages: list[ChatMessage],
        generation_kwargs: Optional[dict[str, Any]] = None,
        tools: Optional[ToolsType] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Asynchronously invoke the text generation inference based on the provided messages and generation parameters.

        This is the asynchronous version of the `run` method. It has the same parameters
        and return values but can be used with `await` in async code.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of Tool and/or Toolset objects, or a single Toolset, for which the model can prepare calls.
            If set, it overrides the `tools` parameter set during component initialization.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it overrides the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        flat_tools = flatten_tools_or_toolsets(tools)
        _check_duplicate_tool_names(flat_tools)

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)

        if streaming_callback:
            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)
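
    # Async usage sketch (illustrative, not part of the original module):
    #     import asyncio
    #     result = asyncio.run(generator.run_async(messages))
    # returns the same {"replies": [...]} shape as the synchronous run().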

    def _run_streaming(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        streaming_callback: SyncStreamingCallbackT,
    ):
        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        component_info = ComponentInfo.from_component(self)
        streaming_chunks: list[StreamingChunk] = []
        for chunk in api_output:
            streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
            )
            streaming_chunks.append(streaming_chunk)
            streaming_callback(streaming_chunk)

        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
        if message.meta.get("usage") is None:
            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}

        return {"replies": [message]}

    def _run_non_streaming(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        tools: Optional[list["ChatCompletionInputTool"]] = None,
    ) -> dict[str, list[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if api_chat_output.choices is None or len(api_chat_output.choices) == 0:
            return {"replies": []}

        # Since `n` is unused, the API always returns only one choice.
        # The argument is probably allowed for compatibility with OpenAI.
        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
        meta: dict[str, Any] = {
            "model": self._client.model,
            "finish_reason": mapped_finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}

    async def _run_streaming_async(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        streaming_callback: AsyncStreamingCallbackT,
    ):
        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        component_info = ComponentInfo.from_component(self)
        streaming_chunks: list[StreamingChunk] = []
        async for chunk in api_output:
            stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
            )
            streaming_chunks.append(stream_chunk)
            await streaming_callback(stream_chunk)

        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
        if message.meta.get("usage") is None:
            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}

        return {"replies": [message]}

    async def _run_non_streaming_async(
        self,
        messages: list[dict[str, str]],
        generation_kwargs: dict[str, Any],
        tools: Optional[list["ChatCompletionInputTool"]] = None,
    ) -> dict[str, list[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if api_chat_output.choices is None or len(api_chat_output.choices) == 0:
            return {"replies": []}

        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
        meta: dict[str, Any] = {
            "model": self._async_client.model,
            "finish_reason": mapped_finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}