• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 15191527043

22 May 2025 04:08PM UTC coverage: 90.345% (-0.07%) from 90.411%
15191527043

Pull #9426

github

web-flow
Merge 212e60881 into 4a5e4d3e6
Pull Request #9426: feat: add component name and type to `StreamingChunk`

11173 of 12367 relevant lines covered (90.35%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.31
haystack/components/generators/chat/hugging_face_api.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import json
1✔
6
from datetime import datetime
1✔
7
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Union
1✔
8

9
from haystack import component, default_from_dict, default_to_dict, logging
1✔
10
from haystack.dataclasses import ChatMessage, ComponentInfo, StreamingChunk, ToolCall, select_streaming_callback
1✔
11
from haystack.dataclasses.streaming_chunk import StreamingCallbackT
1✔
12
from haystack.lazy_imports import LazyImport
1✔
13
from haystack.tools import (
1✔
14
    Tool,
15
    Toolset,
16
    _check_duplicate_tool_names,
17
    deserialize_tools_or_toolset_inplace,
18
    serialize_tools_or_toolset,
19
)
20
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
1✔
21
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
1✔
22
from haystack.utils.url_validation import is_valid_http_url
1✔
23

24
# Module-level logger, named after this module per haystack convention.
logger = logging.getLogger(__name__)

# Defer the heavy huggingface_hub import: the names below only need to resolve when the
# component is actually instantiated; __init__ calls huggingface_hub_import.check(), which
# raises with the pip-install hint in `message` if the package is missing or too old.
with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
    from huggingface_hub import (
        AsyncInferenceClient,
        ChatCompletionInputFunctionDefinition,
        ChatCompletionInputStreamOptions,
        ChatCompletionInputTool,
        ChatCompletionOutput,
        ChatCompletionOutputToolCall,
        ChatCompletionStreamOutput,
        InferenceClient,
    )
37

38

39
def _convert_hfapi_tool_calls(hfapi_tool_calls: Optional[List["ChatCompletionOutputToolCall"]]) -> List[ToolCall]:
    """
    Convert HuggingFace API tool calls to a list of Haystack ToolCall.

    Entries whose arguments cannot be interpreted (malformed JSON string, unexpected type,
    or falsy/empty arguments) are skipped with a warning rather than raising.

    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
    :returns: A list of ToolCall objects.
    """
    if not hfapi_tool_calls:
        return []

    tool_calls = []

    for hfapi_tc in hfapi_tool_calls:
        hf_arguments = hfapi_tc.function.arguments

        arguments = None
        if isinstance(hf_arguments, dict):
            arguments = hf_arguments
        elif isinstance(hf_arguments, str):
            try:
                arguments = json.loads(hf_arguments)
            except json.JSONDecodeError:
                logger.warning(
                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                    _id=hfapi_tc.id,
                    _name=hfapi_tc.function.name,
                    _arguments=hf_arguments,
                )
        else:
            # BUG FIX: the message template references {_type} but no _type kwarg was supplied,
            # leaving the placeholder unfilled in the emitted log line.
            logger.warning(
                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                _type=type(hf_arguments).__name__,
                _id=hfapi_tc.id,
                _name=hfapi_tc.function.name,
                _arguments=hf_arguments,
            )

        # NOTE: an empty arguments dict is also falsy and therefore skipped — preserved behavior.
        if arguments:
            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))

    return tool_calls
82

83

84
def _convert_tools_to_hfapi_tools(
    tools: Optional[Union[List[Tool], Toolset]],
) -> Optional[List["ChatCompletionInputTool"]]:
    """
    Build HuggingFace API tool definitions from Haystack tools.

    :param tools: A list of Tool objects or a Toolset, or None.
    :returns: The equivalent ChatCompletionInputTool list, or None when no tools were given.
    """
    if not tools:
        return None

    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
    schema_field = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"

    return [
        ChatCompletionInputTool(
            function=ChatCompletionInputFunctionDefinition(
                **{"name": tool.name, "description": tool.description, schema_field: tool.parameters}
            ),
            type="function",
        )
        for tool in tools
    ]
102

103

104
@component
1✔
105
class HuggingFaceAPIChatGenerator:
1✔
106
    """
107
    Completes chats using Hugging Face APIs.
108

109
    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
110
    format for input and output. Use it to generate text with Hugging Face APIs:
111
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
112
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
113
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)
114

115
    ### Usage examples
116

117
    #### With the free serverless inference API
118

119
    ```python
120
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
121
    from haystack.dataclasses import ChatMessage
122
    from haystack.utils import Secret
123
    from haystack.utils.hf import HFGenerationAPIType
124

125
    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
126
                ChatMessage.from_user("What's Natural Language Processing?")]
127

128
    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
129
    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
130
    api_type = "serverless_inference_api" # this is equivalent to the above
131

132
    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
133
                                            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
134
                                            token=Secret.from_token("<your-api-key>"))
135

136
    result = generator.run(messages)
137
    print(result)
138
    ```
139

140
    #### With paid inference endpoints
141

142
    ```python
143
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
144
    from haystack.dataclasses import ChatMessage
145
    from haystack.utils import Secret
146

147
    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
148
                ChatMessage.from_user("What's Natural Language Processing?")]
149

150
    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
151
                                            api_params={"url": "<your-inference-endpoint-url>"},
152
                                            token=Secret.from_token("<your-api-key>"))
153

154
    result = generator.run(messages)
    print(result)
    ```

    #### With self-hosted text generation inference
158

159
    ```python
160
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
161
    from haystack.dataclasses import ChatMessage
162

163
    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
164
                ChatMessage.from_user("What's Natural Language Processing?")]
165

166
    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
167
                                            api_params={"url": "http://localhost:8080"})
168

169
    result = generator.run(messages)
170
    print(result)
171
    ```
172
    """
173

174
    # Type annotation for the component name
175
    __component_name__: str
1✔
176

177
    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[Dict[str, Any]] = None,
        stop_words: Optional[List[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
    ):
        """
        Initialize the HuggingFaceAPIChatGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See [Serverless Inference API](https://huggingface.co/inference-api).
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_GENERATION_INFERENCE`.
        :param token:
            The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation.
                Some examples: `max_tokens`, `temperature`, `top_p`.
                For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
        :param stop_words:
            An optional list of strings representing the stop words.
        :param streaming_callback:
            An optional callable for handling streaming responses.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls.
            The chosen model should support tool/function calling, according to the model card.
            Support for tools in the Hugging Face API and TGI is not yet fully refined and you may experience
            unexpected behavior. This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
        :raises ValueError:
            If required `api_params` keys are missing, the URL is invalid, the api_type is unknown,
            or both `tools` and `streaming_callback` are provided.
        """

        # Raises with an install hint if huggingface_hub is missing or too old.
        huggingface_hub_import.check()

        # Accept the api_type as a plain string for convenience.
        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        # Resolve the target the InferenceClient will talk to: a model ID for the
        # serverless API, or a base URL for TGI / Inference Endpoints.
        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
                    "in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        # Tools and streaming are mutually exclusive (also re-checked in run/run_async).
        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # handle generation kwargs setup
        # Copy so the caller's dict is never mutated; merge stop_words into "stop".
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
        generation_kwargs["stop"].extend(stop_words or [])
        generation_kwargs.setdefault("max_tokens", 512)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback
        # Separate sync and async clients, both pointing at the same model or URL.
        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)
        self._async_client = AsyncInferenceClient(model_or_url, token=token.resolve_value() if token else None)
        self.tools = tools
264

265
    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        streaming_callback_name = None
        if self.streaming_callback:
            streaming_callback_name = serialize_callable(self.streaming_callback)

        serialized_token = self.token.to_dict() if self.token else None

        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=serialized_token,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=streaming_callback_name,
            tools=serialize_tools_or_toolset(self.tools),
        )
282

283
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
        """
        Deserialize this component from a dictionary.
        """
        # Secrets and tools are rehydrated in place inside data["init_parameters"].
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")

        serialized_callback = data.get("init_parameters", {}).get("streaming_callback")
        if serialized_callback:
            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback)

        return default_from_dict(cls, data)
295

296
    @component.output_types(replies=List[ChatMessage])
    def run(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override
            the `tools` parameter set during component initialization. This parameter can accept either a
            list of `Tool` objects or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        :raises ValueError:
            If tools and a streaming callback (from init or runtime) are used together.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback (runtime overrides init-time)
        streaming_callback = select_streaming_callback(
            self.streaming_callback, streaming_callback, requires_async=False
        )

        # BUG FIX: previously only self.streaming_callback was checked, so passing a runtime
        # streaming_callback together with tools silently ignored the tools. Check the
        # resolved callback instead, so the tools/streaming conflict is always rejected.
        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")

        if streaming_callback:
            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

        # A Toolset is iterable but the HF converter expects a plain list of Tool objects.
        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)
346

347
    @component.output_types(replies=List[ChatMessage])
    async def run_async(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Asynchronously invokes the text generation inference based on the provided messages and generation parameters.

        This is the asynchronous version of the `run` method. It has the same parameters
        and return values but can be used with `await` in an async code.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
            parameter set during component initialization. This parameter can accept either a list of `Tool` objects
            or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        :raises ValueError:
            If tools and a streaming callback (from init or runtime) are used together.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback (runtime overrides init-time)
        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)

        # BUG FIX: previously only self.streaming_callback was checked, so passing a runtime
        # streaming_callback together with tools silently ignored the tools. Check the
        # resolved callback instead, so the tools/streaming conflict is always rejected.
        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")

        if streaming_callback:
            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)

        # A Toolset is iterable but the HF converter expects a plain list of Tool objects.
        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)
398

399
    def _run_streaming(
        self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any], streaming_callback: StreamingCallbackT
    ):
        """
        Run a streaming chat completion and forward each chunk to `streaming_callback`.

        :param messages: Messages already converted to the HF API dict format.
        :param generation_kwargs: Keyword arguments passed through to `chat_completion`.
        :param streaming_callback: Synchronous callable invoked once per content chunk.
        :returns: A dictionary with a `replies` list containing one assembled ChatMessage.
        """
        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
            messages,
            stream=True,
            # ask the server to append a final usage chunk (it has an empty `choices` list)
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        generated_text = ""
        first_chunk_time: Optional[str] = None
        finish_reason: Optional[str] = None
        usage = None
        meta: Dict[str, Any] = {}

        # get component name and type
        # __component_name__ is set by the pipeline when the component is added to one;
        # standalone usage falls back to None.
        component_name = self.__component_name__ if hasattr(self, "__component_name__") else None
        component_type = self.__class__.__module__ + "." + self.__class__.__name__
        component_info = ComponentInfo(name=component_name, type=component_type)

        # Set up streaming handler
        for chunk in api_output:
            # The chunk with usage returns an empty array for choices
            if len(chunk.choices) > 0:
                # n is unused, so the API always returns only one choice
                # the argument is probably allowed for compatibility with OpenAI
                # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
                choice = chunk.choices[0]

                text = choice.delta.content or ""
                generated_text += text

                if choice.finish_reason:
                    finish_reason = choice.finish_reason
                    meta["finish_reason"] = finish_reason

                meta["model"] = self._client.model
                # NOTE(review): every StreamingChunk receives the same `meta` dict object, so
                # later mutations (finish_reason, final meta.update) are visible to chunks the
                # callback may have retained — confirm this aliasing is intended.
                stream_chunk = StreamingChunk(text, meta, component_info)
                streaming_callback(stream_chunk)

            if chunk.usage:
                usage = chunk.usage

            # Timestamp of the first received chunk (ISO 8601, local time) for latency metadata.
            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

        if usage:
            usage_dict = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
        else:
            # No usage chunk received: report zeros rather than omitting the key.
            usage_dict = {"prompt_tokens": 0, "completion_tokens": 0}

        meta.update(
            {
                "model": self._client.model,
                "index": 0,
                "finish_reason": finish_reason,
                "usage": usage_dict,
                "completion_start_time": first_chunk_time,
            }
        )

        message = ChatMessage.from_assistant(text=generated_text, meta=meta)
        return {"replies": [message]}
463

464
    def _run_non_streaming(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        """
        Perform a single (non-streaming) chat completion call and build the reply.

        :param messages: Messages already converted to the HF API dict format.
        :param generation_kwargs: Keyword arguments passed through to `chat_completion`.
        :param tools: Optional HF API tool definitions the model may call.
        :returns: A dictionary with a `replies` list containing one ChatMessage (or none).
        """
        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if not api_chat_output.choices:
            return {"replies": []}

        # n is unused, so the API always returns only one choice
        # the argument is probably allowed for compatibility with OpenAI
        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
        choice = api_chat_output.choices[0]

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        # Default to zero token counts when the API omits usage information.
        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }

        meta: Dict[str, Any] = {
            "model": self._client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
            "usage": usage,
        }

        message = ChatMessage.from_assistant(text=choice.message.content, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}
502

503
    async def _run_streaming_async(
        self, messages: List[Dict[str, str]], generation_kwargs: Dict[str, Any], streaming_callback: StreamingCallbackT
    ):
        """
        Asynchronously run a streaming chat completion and await `streaming_callback` per chunk.

        :param messages: Messages already converted to the HF API dict format.
        :param generation_kwargs: Keyword arguments passed through to `chat_completion`.
        :param streaming_callback: Awaitable callable invoked once per content chunk.
        :returns: A dictionary with a `replies` list containing one assembled ChatMessage.
        """
        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
            messages,
            stream=True,
            # ask the server to append a final usage chunk (it has an empty `choices` list)
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        generated_text = ""
        first_chunk_time: Optional[str] = None
        finish_reason: Optional[str] = None
        usage = None
        meta: Dict[str, Any] = {}

        # __component_name__ is set by the pipeline when the component is added to one;
        # standalone usage falls back to None.
        component_name = self.__component_name__ if hasattr(self, "__component_name__") else None
        component_type = self.__class__.__module__ + "." + self.__class__.__name__
        component_info = ComponentInfo(name=component_name, type=component_type)

        async for chunk in api_output:
            # The chunk with usage returns an empty array for choices
            if len(chunk.choices) > 0:
                # n is unused, so the API always returns only one choice
                # the argument is probably allowed for compatibility with OpenAI
                # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
                choice = chunk.choices[0]

                text = choice.delta.content or ""
                generated_text += text

                if choice.finish_reason:
                    finish_reason = choice.finish_reason
                    meta["finish_reason"] = finish_reason

                meta["model"] = self._async_client.model

                # NOTE(review): every StreamingChunk receives the same `meta` dict object, so
                # later mutations are visible to chunks the callback may have retained —
                # confirm this aliasing is intended.
                stream_chunk = StreamingChunk(text, meta, component_info)
                await streaming_callback(stream_chunk)  # type: ignore

            if chunk.usage:
                usage = chunk.usage

            # Timestamp of the first received chunk (ISO 8601, local time) for latency metadata.
            if first_chunk_time is None:
                first_chunk_time = datetime.now().isoformat()

        if usage:
            usage_dict = {"prompt_tokens": usage.prompt_tokens, "completion_tokens": usage.completion_tokens}
        else:
            # No usage chunk received: report zeros rather than omitting the key.
            usage_dict = {"prompt_tokens": 0, "completion_tokens": 0}

        meta.update(
            {
                "model": self._async_client.model,
                "index": 0,
                "finish_reason": finish_reason,
                "usage": usage_dict,
                "completion_start_time": first_chunk_time,
            }
        )

        message = ChatMessage.from_assistant(text=generated_text, meta=meta)
        return {"replies": [message]}
566

567
    async def _run_non_streaming_async(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        """
        Asynchronously perform a single (non-streaming) chat completion call and build the reply.

        :param messages: Messages already converted to the HF API dict format.
        :param generation_kwargs: Keyword arguments passed through to `chat_completion`.
        :param tools: Optional HF API tool definitions the model may call.
        :returns: A dictionary with a `replies` list containing one ChatMessage (or none).
        """
        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if not api_chat_output.choices:
            return {"replies": []}

        # The API always returns a single choice (`n` is unused).
        choice = api_chat_output.choices[0]

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        # Default to zero token counts when the API omits usage information.
        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }

        meta: Dict[str, Any] = {
            "model": self._async_client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
            "usage": usage,
        }

        message = ChatMessage.from_assistant(text=choice.message.content, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc