• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 15043437065

15 May 2025 11:09AM UTC coverage: 90.443% (-0.02%) from 90.465%
15043437065

push

github

web-flow
feat: Add `usage` when using `HuggingFaceAPIChatGenerator` with streaming (#9371)

* Small fix and update tests

* Add usage support to streaming for HuggingFaceAPIChatGenerator

* Add reno

* try using provider='auto'

* Undo provider

* Fix unit tests

* Update releasenotes/notes/add-usage-hf-api-chat-streaming-91fd04705f45d5b3.yaml

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>

10940 of 12096 relevant lines covered (90.44%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.0
haystack/components/generators/chat/hugging_face_api.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
from datetime import datetime
1✔
7
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Union
1✔
8

9
from haystack import component, default_from_dict, default_to_dict, logging
1✔
10
from haystack.dataclasses import ChatMessage, StreamingChunk, ToolCall, select_streaming_callback
1✔
11
from haystack.dataclasses.streaming_chunk import StreamingCallbackT
1✔
12
from haystack.lazy_imports import LazyImport
1✔
13
from haystack.tools import (
1✔
14
    Tool,
15
    Toolset,
16
    _check_duplicate_tool_names,
17
    deserialize_tools_or_toolset_inplace,
18
    serialize_tools_or_toolset,
19
)
20
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
1✔
21
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
1✔
22
from haystack.utils.url_validation import is_valid_http_url
1✔
23

24
logger = logging.getLogger(__name__)
1✔
25

26
# Defer the heavy huggingface_hub import. The import only actually runs when
# huggingface_hub_import.check() is called (in __init__); if the package is
# missing, check() raises with the pip-install hint given in `message`.
with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
    from huggingface_hub import (
        AsyncInferenceClient,
        ChatCompletionInputFunctionDefinition,
        ChatCompletionInputStreamOptions,
        ChatCompletionInputTool,
        ChatCompletionOutput,
        ChatCompletionOutputToolCall,
        ChatCompletionStreamOutput,
        InferenceClient,
    )

38

39
def _convert_hfapi_tool_calls(hfapi_tool_calls: Optional[List["ChatCompletionOutputToolCall"]]) -> List[ToolCall]:
    """
    Convert HuggingFace API tool calls to a list of Haystack ToolCall.

    Arguments may arrive either as a dict or as a JSON-encoded string; any tool call whose
    arguments are malformed JSON or of an unsupported type is skipped with a warning.

    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
    :returns: A list of ToolCall objects.
    """
    if not hfapi_tool_calls:
        return []

    tool_calls = []

    for hfapi_tc in hfapi_tool_calls:
        hf_arguments = hfapi_tc.function.arguments

        arguments = None
        if isinstance(hf_arguments, dict):
            arguments = hf_arguments
        elif isinstance(hf_arguments, str):
            try:
                arguments = json.loads(hf_arguments)
            except json.JSONDecodeError:
                logger.warning(
                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                    _id=hfapi_tc.id,
                    _name=hfapi_tc.function.name,
                    _arguments=hf_arguments,
                )
        else:
            logger.warning(
                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                # fix: the message template references {_type} but the kwarg was never passed,
                # so the placeholder could not be filled in the log output
                _type=type(hf_arguments).__name__,
                _id=hfapi_tc.id,
                _name=hfapi_tc.function.name,
                _arguments=hf_arguments,
            )

        # fix: use `is not None` instead of truthiness — a tool legitimately invoked with no
        # parameters yields an empty dict ({}), which must still produce a ToolCall instead
        # of being silently dropped
        if arguments is not None:
            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))

    return tool_calls
82

83

84
def _convert_tools_to_hfapi_tools(
    tools: Optional[Union[List[Tool], Toolset]],
) -> Optional[List["ChatCompletionInputTool"]]:
    """
    Translate Haystack tools into the wire format expected by the Hugging Face API client.

    :param tools: A list of Tool objects or a Toolset; may be None or empty.
    :returns: The corresponding ChatCompletionInputTool objects, or None when no tools were given.
    """
    if not tools:
        return None

    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
    parameters_name = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"

    return [
        ChatCompletionInputTool(
            function=ChatCompletionInputFunctionDefinition(
                **{"name": tool.name, "description": tool.description, parameters_name: tool.parameters}
            ),
            type="function",
        )
        for tool in tools
    ]
102

103

104
@component
class HuggingFaceAPIChatGenerator:
    """
    Completes chats using Hugging Face APIs.

    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
    format for input and output. Use it to generate text with Hugging Face APIs:
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)

    ### Usage examples

    #### With the free serverless inference API

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
    api_type = "serverless_inference_api" # this is equivalent to the above

    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
                                            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With paid inference endpoints

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
                                            api_params={"url": "<your-inference-endpoint-url>"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With self-hosted text generation inference

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
                                            api_params={"url": "http://localhost:8080"})

    result = generator.run(messages)
    print(result)
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[Dict[str, Any]] = None,
        stop_words: Optional[List[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
    ):
        """
        Initialize the HuggingFaceAPIChatGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See [Serverless Inference API](https://huggingface.co/inference-api).
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_GENERATION_INFERENCE`.
        :param token:
            The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation.
                Some examples: `max_tokens`, `temperature`, `top_p`.
                For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
        :param stop_words:
            An optional list of strings representing the stop words.
        :param streaming_callback:
            An optional callable for handling streaming responses.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls.
            The chosen model should support tool/function calling, according to the model card.
            Support for tools in the Hugging Face API and TGI is not yet fully refined and you may experience
            unexpected behavior. This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
        :raises ValueError:
            If the required `api_params` key for the chosen `api_type` is missing, the URL is invalid,
            tools are combined with a streaming callback, or tool names are duplicated.
        """

        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
                    "in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        # tools + streaming is not supported; the same invariant is re-checked at run time
        # because both can be overridden per call
        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # handle generation kwargs setup; copy so the caller's dict is never mutated
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
        generation_kwargs["stop"].extend(stop_words or [])
        generation_kwargs.setdefault("max_tokens", 512)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback
        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)
        self._async_client = AsyncInferenceClient(model_or_url, token=token.resolve_value() if token else None)
        self.tools = tools

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=self.token.to_dict() if self.token else None,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=callback_name,
            tools=serialize_tools_or_toolset(self.tools),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
        """
        Deserialize this component from a dictionary.

        :param data: The dictionary produced by `to_dict`.
        :returns: The deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
        init_params = data.get("init_parameters", {})
        serialized_callback_handler = init_params.get("streaming_callback")
        if serialized_callback_handler:
            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
        return default_from_dict(cls, data)

    @component.output_types(replies=List[ChatMessage])
    def run(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override
            the `tools` parameter set during component initialization. This parameter can accept either a
            list of `Tool` objects or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :raises ValueError:
            If tools and a streaming callback are requested at the same time, or tool names are duplicated.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        # validate and select the streaming callback before the tools check so that the
        # run-time override is taken into account
        streaming_callback = select_streaming_callback(
            self.streaming_callback, streaming_callback, requires_async=False
        )

        tools = tools or self.tools
        # Bug fix: check the *resolved* callback. The previous check tested only
        # self.streaming_callback, so a callback passed directly to run() together with
        # tools slipped through and the tools were silently ignored during streaming.
        if tools and streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        if streaming_callback:
            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

        # a Toolset is iterable but the HF conversion expects a plain list
        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)

    @component.output_types(replies=List[ChatMessage])
    async def run_async(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Asynchronously invokes the text generation inference based on the provided messages and generation parameters.

        This is the asynchronous version of the `run` method. It has the same parameters
        and return values but can be used with `await` in an async code.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
            parameter set during component initialization. This parameter can accept either a list of `Tool` objects
            or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :raises ValueError:
            If tools and a streaming callback are requested at the same time, or tool names are duplicated.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        # validate and select the streaming callback before the tools check so that the
        # run-time override is taken into account
        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)

        tools = tools or self.tools
        # Bug fix: check the *resolved* callback (same rationale as in run())
        if tools and streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        if streaming_callback:
            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)

        # a Toolset is iterable but the HF conversion expects a plain list
        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)

    def _run_streaming(
        self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any], streaming_callback: StreamingCallbackT
    ):
        # Stream completion chunks, invoking the callback per chunk, and assemble a single reply.
        # include_usage makes the API emit a final usage-only chunk (with empty choices).
        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        generated_text = ""
        first_chunk_time = None
        finish_reason = None
        usage = None
        meta: Dict[str, Any] = {}

        for chunk in api_output:
            # The chunk with usage returns an empty array for choices
            if len(chunk.choices) > 0:
                # n is unused, so the API always returns only one choice
                # the argument is probably allowed for compatibility with OpenAI
                # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
                choice = chunk.choices[0]

                text = choice.delta.content or ""
                generated_text += text

                if choice.finish_reason:
                    finish_reason = choice.finish_reason

                stream_chunk = StreamingChunk(text, meta)
                streaming_callback(stream_chunk)

            if chunk.usage:
                usage = chunk.usage

            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

        if usage:
            usage_dict = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
        else:
            # usage chunk may be absent (e.g. server does not support stream_options)
            usage_dict = {"prompt_tokens": 0, "completion_tokens": 0}

        meta.update(
            {
                "model": self._client.model,
                "index": 0,
                "finish_reason": finish_reason,
                "usage": usage_dict,
                "completion_start_time": first_chunk_time,
            }
        )

        message = ChatMessage.from_assistant(text=generated_text, meta=meta)
        return {"replies": [message]}

    def _run_non_streaming(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        # Single blocking completion call; converts the first choice into a ChatMessage.
        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if len(api_chat_output.choices) == 0:
            return {"replies": []}

        # n is unused, so the API always returns only one choice
        # the argument is probably allowed for compatibility with OpenAI
        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        meta: Dict[str, Any] = {
            "model": self._client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}

    async def _run_streaming_async(
        self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any], streaming_callback: StreamingCallbackT
    ):
        # Async twin of _run_streaming; the callback is awaited per chunk.
        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        generated_text = ""
        first_chunk_time = None
        finish_reason = None
        usage = None
        meta: Dict[str, Any] = {}

        async for chunk in api_output:
            # The chunk with usage returns an empty array for choices
            if len(chunk.choices) > 0:
                # n is unused, so the API always returns only one choice
                # the argument is probably allowed for compatibility with OpenAI
                # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
                choice = chunk.choices[0]

                text = choice.delta.content or ""
                generated_text += text

                if choice.finish_reason:
                    finish_reason = choice.finish_reason

                stream_chunk = StreamingChunk(text, meta)
                await streaming_callback(stream_chunk)  # type: ignore

            if chunk.usage:
                usage = chunk.usage

            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

        if usage:
            usage_dict = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
        else:
            # usage chunk may be absent (e.g. server does not support stream_options)
            usage_dict = {"prompt_tokens": 0, "completion_tokens": 0}

        meta.update(
            {
                "model": self._async_client.model,
                "index": 0,
                "finish_reason": finish_reason,
                "usage": usage_dict,
                "completion_start_time": first_chunk_time,
            }
        )

        message = ChatMessage.from_assistant(text=generated_text, meta=meta)
        return {"replies": [message]}

    async def _run_non_streaming_async(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        # Async twin of _run_non_streaming.
        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if len(api_chat_output.choices) == 0:
            return {"replies": []}

        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        meta: Dict[str, Any] = {
            "model": self._async_client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc