deepset-ai / haystack / 15254528833

26 May 2025 12:56PM UTC coverage: 90.146% (-0.3%) from 90.411%
Pull Request #9426: feat: add component name and type to `StreamingChunk`
Merge 06c2b66b1 into 802328e29
11398 of 12644 relevant lines covered (90.15%) · 0.9 hits per line

Source file: haystack/components/generators/chat/hugging_face_api.py (94.15% of lines covered)
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from datetime import datetime
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Union

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses import ChatMessage, ComponentInfo, StreamingChunk, ToolCall, select_streaming_callback
from haystack.dataclasses.streaming_chunk import StreamingCallbackT
from haystack.lazy_imports import LazyImport
from haystack.tools import (
    Tool,
    Toolset,
    _check_duplicate_tool_names,
    deserialize_tools_or_toolset_inplace,
    serialize_tools_or_toolset,
)
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
from haystack.utils.url_validation import is_valid_http_url

logger = logging.getLogger(__name__)

with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
    from huggingface_hub import (
        AsyncInferenceClient,
        ChatCompletionInputFunctionDefinition,
        ChatCompletionInputStreamOptions,
        ChatCompletionInputTool,
        ChatCompletionOutput,
        ChatCompletionOutputToolCall,
        ChatCompletionStreamOutput,
        InferenceClient,
    )

def _convert_hfapi_tool_calls(hfapi_tool_calls: Optional[List["ChatCompletionOutputToolCall"]]) -> List[ToolCall]:
    """
    Convert HuggingFace API tool calls to a list of Haystack ToolCall.

    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
    :returns: A list of ToolCall objects.
    """
    if not hfapi_tool_calls:
        return []

    tool_calls = []

    for hfapi_tc in hfapi_tool_calls:
        hf_arguments = hfapi_tc.function.arguments

        arguments = None
        if isinstance(hf_arguments, dict):
            arguments = hf_arguments
        elif isinstance(hf_arguments, str):
            try:
                arguments = json.loads(hf_arguments)
            except json.JSONDecodeError:
                logger.warning(
                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                    _id=hfapi_tc.id,
                    _name=hfapi_tc.function.name,
                    _arguments=hf_arguments,
                )
        else:
            logger.warning(
                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                _type=type(hf_arguments).__name__,
                _id=hfapi_tc.id,
                _name=hfapi_tc.function.name,
                _arguments=hf_arguments,
            )

        if arguments:
            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))

    return tool_calls

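# Behavior of _convert_hfapi_tool_calls above, for illustration (hypothetical
# payloads, not part of the original module): the API may deliver tool call
# arguments either as an already-parsed dict or as a JSON string, and both
# forms below yield the same ToolCall:
#   function.arguments == {"city": "Berlin"}    -> used as-is
#   function.arguments == '{"city": "Berlin"}'  -> json.loads(...) -> {"city": "Berlin"}
# Any other type, or a malformed JSON string, is logged and the tool call is skipped.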
def _convert_tools_to_hfapi_tools(
    tools: Optional[Union[List[Tool], Toolset]],
) -> Optional[List["ChatCompletionInputTool"]]:
    if not tools:
        return None

    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
    parameters_name = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"

    hf_tools = []
    for tool in tools:
        hf_tools_args = {"name": tool.name, "description": tool.description, parameters_name: tool.parameters}

        hf_tools.append(
            ChatCompletionInputTool(function=ChatCompletionInputFunctionDefinition(**hf_tools_args), type="function")
        )

    return hf_tools

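# Behavior of _convert_tools_to_hfapi_tools above, for illustration (hypothetical
# tool, not part of the original module): a Tool named "get_weather" with a JSON
# schema in tool.parameters maps, on huggingface_hub>=0.31.0, to roughly:
#   ChatCompletionInputTool(
#       function=ChatCompletionInputFunctionDefinition(
#           name="get_weather", description="...", parameters={...}
#       ),
#       type="function",
#   )
# On huggingface_hub<0.31.0 the same schema is passed via the legacy "arguments"
# keyword instead of "parameters".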
@component
class HuggingFaceAPIChatGenerator:
    """
    Completes chats using Hugging Face APIs.

    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
    format for input and output. Use it to generate text with Hugging Face APIs:
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)

    ### Usage examples

    #### With the free serverless inference API

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
    api_type = "serverless_inference_api"  # this is equivalent to the above

    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
                                            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With paid inference endpoints

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
                                            api_params={"url": "<your-inference-endpoint-url>"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With self-hosted text generation inference

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
                                            api_params={"url": "http://localhost:8080"})

    result = generator.run(messages)
    print(result)
    ```
    """

    # Type annotation for the component name
    __component_name__: str

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[Dict[str, Any]] = None,
        stop_words: Optional[List[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
    ):
        """
        Initialize the HuggingFaceAPIChatGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See [Serverless Inference API](https://huggingface.co/inference-api).
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_GENERATION_INFERENCE`.
        :param token:
            The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation.
            Some examples: `max_tokens`, `temperature`, `top_p`.
            For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
        :param stop_words:
            An optional list of strings representing the stop words.
        :param streaming_callback:
            An optional callable for handling streaming responses.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls.
            The chosen model should support tool/function calling, according to the model card.
            Support for tools in the Hugging Face API and TGI is not yet fully refined and you may experience
            unexpected behavior. This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
        """

        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
                    "in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # handle generation kwargs setup
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
        generation_kwargs["stop"].extend(stop_words or [])
        generation_kwargs.setdefault("max_tokens", 512)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback
        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)
        self._async_client = AsyncInferenceClient(model_or_url, token=token.resolve_value() if token else None)
        self.tools = tools

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=self.token.to_dict() if self.token else None,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=callback_name,
            tools=serialize_tools_or_toolset(self.tools),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
        """
        Deserialize this component from a dictionary.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
        init_params = data.get("init_parameters", {})
        serialized_callback_handler = init_params.get("streaming_callback")
        if serialized_callback_handler:
            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
        return default_from_dict(cls, data)

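    # A minimal round-trip sketch (assumed usage, not part of the original file):
    #   data = generator.to_dict()
    #   restored = HuggingFaceAPIChatGenerator.from_dict(data)
    # The default `token` is serialized as an env-var reference rather than a raw
    # secret, and `streaming_callback` is stored as a fully qualified import path.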
    @component.output_types(replies=List[ChatMessage])
    def run(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override
            the `tools` parameter set during component initialization. This parameter can accept either a
            list of `Tool` objects or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(
            self.streaming_callback, streaming_callback, requires_async=False
        )

        if streaming_callback:
            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)

    @component.output_types(replies=List[ChatMessage])
    async def run_async(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Asynchronously invoke the text generation inference based on the provided messages and generation parameters.

        This is the asynchronous version of the `run` method. It has the same parameters
        and return values but can be used with `await` in async code.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
            parameter set during component initialization. This parameter can accept either a list of `Tool` objects
            or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)

        if streaming_callback:
            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)

        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)

    def _run_streaming(
        self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any], streaming_callback: StreamingCallbackT
    ):
        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        generated_text = ""
        first_chunk_time = None
        finish_reason = None
        usage = None
        meta: Dict[str, Any] = {}

        # get component name and type
        component_name = self.__component_name__ if hasattr(self, "__component_name__") else None
        component_type = self.__class__.__module__ + "." + self.__class__.__name__
        component_info = ComponentInfo(name=component_name, type=component_type)

        # consume the stream, forwarding each chunk to the callback
        for chunk in api_output:
            # The chunk with usage returns an empty array for choices
            if len(chunk.choices) > 0:
                # n is unused, so the API always returns only one choice
                # the argument is probably allowed for compatibility with OpenAI
                # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
                choice = chunk.choices[0]

                text = choice.delta.content or ""
                generated_text += text

                if choice.finish_reason:
                    finish_reason = choice.finish_reason

                stream_chunk = StreamingChunk(text, meta, component_info)
                streaming_callback(stream_chunk)

            if chunk.usage:
                usage = chunk.usage

            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

        if usage:
            usage_dict = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
        else:
            usage_dict = {"prompt_tokens": 0, "completion_tokens": 0}

        meta.update(
            {
                "model": self._client.model,
                "index": 0,
                "finish_reason": finish_reason,
                "usage": usage_dict,
                "completion_start_time": first_chunk_time,
            }
        )

        message = ChatMessage.from_assistant(text=generated_text, meta=meta)
        return {"replies": [message]}

    def _run_non_streaming(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if len(api_chat_output.choices) == 0:
            return {"replies": []}

        # n is unused, so the API always returns only one choice
        # the argument is probably allowed for compatibility with OpenAI
        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        meta: Dict[str, Any] = {
            "model": self._client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}

    async def _run_streaming_async(
        self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any], streaming_callback: StreamingCallbackT
    ):
        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        generated_text = ""
        first_chunk_time = None
        finish_reason = None
        usage = None
        meta: Dict[str, Any] = {}

        # get component name and type
        component_name = self.__component_name__ if hasattr(self, "__component_name__") else None
        component_type = self.__class__.__module__ + "." + self.__class__.__name__
        component_info = ComponentInfo(name=component_name, type=component_type)

        async for chunk in api_output:
            # The chunk with usage returns an empty array for choices
            if len(chunk.choices) > 0:
                # n is unused, so the API always returns only one choice
                # the argument is probably allowed for compatibility with OpenAI
                # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
                choice = chunk.choices[0]

                text = choice.delta.content or ""
                generated_text += text

                # mirror the sync path: capture the finish reason so it is not lost
                if choice.finish_reason:
                    finish_reason = choice.finish_reason

                stream_chunk = StreamingChunk(text, meta, component_info)
                await streaming_callback(stream_chunk)  # type: ignore

            if chunk.usage:
                usage = chunk.usage

            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

        if usage:
            usage_dict = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
        else:
            usage_dict = {"prompt_tokens": 0, "completion_tokens": 0}

        meta.update(
            {
                "model": self._async_client.model,
                "index": 0,
                "finish_reason": finish_reason,
                "usage": usage_dict,
                "completion_start_time": first_chunk_time,
            }
        )

        message = ChatMessage.from_assistant(text=generated_text, meta=meta)
        return {"replies": [message]}

    async def _run_non_streaming_async(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if len(api_chat_output.choices) == 0:
            return {"replies": []}

        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        meta: Dict[str, Any] = {
            "model": self._async_client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}
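
The sketch below (not part of the file above) shows one way to consume the `component_info` that this pull request attaches to each `StreamingChunk`; the callback body and the model choice are illustrative assumptions.

```python
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
from haystack.dataclasses import ChatMessage, StreamingChunk


def print_chunk(chunk: StreamingChunk) -> None:
    # component_info carries the emitting component's name and type; the name
    # may be None when the generator is not running inside a pipeline.
    info = chunk.component_info
    origin = f"{info.name or '?'} ({info.type})" if info else "unknown"
    print(f"[{origin}] {chunk.content}", flush=True)


generator = HuggingFaceAPIChatGenerator(
    api_type="serverless_inference_api",
    api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},  # illustrative model
    streaming_callback=print_chunk,
)
result = generator.run([ChatMessage.from_user("What's Natural Language Processing?")])
print(result["replies"][0].text)
```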