deepset-ai / haystack · build 15825051859
23 Jun 2025 01:05PM UTC · coverage: 90.176% (-0.005%) from 90.181%
Pull Request #9536: feat: Add `finish_reason` field to `StreamingChunk` (merge 0408d779d into 556dcc9e4)
11575 of 12836 relevant lines covered (90.18%) · 0.9 hits per line

Source file: haystack/components/generators/chat/hugging_face_api.py (95.6% covered)

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from datetime import datetime
from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Union

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
from haystack.dataclasses import (
    AsyncStreamingCallbackT,
    ChatMessage,
    ComponentInfo,
    StreamingCallbackT,
    StreamingChunk,
    SyncStreamingCallbackT,
    ToolCall,
    select_streaming_callback,
)
from haystack.lazy_imports import LazyImport
from haystack.tools import (
    Tool,
    Toolset,
    _check_duplicate_tool_names,
    deserialize_tools_or_toolset_inplace,
    serialize_tools_or_toolset,
)
from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
from haystack.utils.url_validation import is_valid_http_url

logger = logging.getLogger(__name__)

with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
    from huggingface_hub import (
        AsyncInferenceClient,
        ChatCompletionInputFunctionDefinition,
        ChatCompletionInputStreamOptions,
        ChatCompletionInputTool,
        ChatCompletionOutput,
        ChatCompletionOutputToolCall,
        ChatCompletionStreamOutput,
        InferenceClient,
    )


def _convert_hfapi_tool_calls(hfapi_tool_calls: Optional[List["ChatCompletionOutputToolCall"]]) -> List[ToolCall]:
    """
    Convert HuggingFace API tool calls to a list of Haystack ToolCall.

    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
    :returns: A list of ToolCall objects.
    """
    if not hfapi_tool_calls:
        return []

    tool_calls = []

    for hfapi_tc in hfapi_tool_calls:
        hf_arguments = hfapi_tc.function.arguments

        arguments = None
        if isinstance(hf_arguments, dict):
            arguments = hf_arguments
        elif isinstance(hf_arguments, str):
            try:
                arguments = json.loads(hf_arguments)
            except json.JSONDecodeError:
                logger.warning(
                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                    _id=hfapi_tc.id,
                    _name=hfapi_tc.function.name,
                    _arguments=hf_arguments,
                )
        else:
            logger.warning(
                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
                _type=type(hf_arguments).__name__,
                _id=hfapi_tc.id,
                _name=hfapi_tc.function.name,
                _arguments=hf_arguments,
            )

        if arguments:
            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))

    return tool_calls
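
# Illustrative sketch (not part of the module): the converter only reads the
# `id`, `function.name`, and `function.arguments` attributes, so any object of
# that shape converts. Hypothetical values:
#
#   from types import SimpleNamespace
#   fake_tc = SimpleNamespace(id="call_1", function=SimpleNamespace(name="get_weather", arguments='{"city": "Paris"}'))
#   _convert_hfapi_tool_calls([fake_tc])
#   # -> [ToolCall(tool_name="get_weather", arguments={"city": "Paris"}, id="call_1")]

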
def _convert_tools_to_hfapi_tools(
    tools: Optional[Union[List[Tool], Toolset]],
) -> Optional[List["ChatCompletionInputTool"]]:
    if not tools:
        return None

    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
    parameters_name = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"

    hf_tools = []
    for tool in tools:
        hf_tools_args = {"name": tool.name, "description": tool.description, parameters_name: tool.parameters}

        hf_tools.append(
            ChatCompletionInputTool(function=ChatCompletionInputFunctionDefinition(**hf_tools_args), type="function")
        )

    return hf_tools
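
# Illustrative sketch (not part of the module, assuming the Tool dataclass
# fields name/description/parameters/function): a Haystack Tool is wrapped into
# a "function"-type ChatCompletionInputTool, with its JSON schema passed under
# whichever keyword the installed huggingface_hub version expects:
#
#   tool = Tool(name="echo", description="Echo a value",
#               parameters={"type": "object", "properties": {"value": {"type": "string"}}},
#               function=lambda value: value)
#   _convert_tools_to_hfapi_tools([tool])  # -> [ChatCompletionInputTool(function=..., type="function")]

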
def _convert_chat_completion_stream_output_to_streaming_chunk(
    chunk: "ChatCompletionStreamOutput",
    previous_chunks: List[StreamingChunk],
    component_info: Optional[ComponentInfo] = None,
) -> StreamingChunk:
    """
    Converts the Hugging Face API ChatCompletionStreamOutput to a StreamingChunk.
    """
    # When include_usage is set to True, the final chunk has no choices and only carries usage information.
    if len(chunk.choices) == 0:
        usage = None
        if chunk.usage:
            usage = {"prompt_tokens": chunk.usage.prompt_tokens, "completion_tokens": chunk.usage.completion_tokens}
        return StreamingChunk(
            content="",
            meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "usage": usage},
            component_info=component_info,
        )

    # The `n` argument is unused and the API always returns a single choice;
    # it is probably accepted only for compatibility with OpenAI.
    # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
    choice = chunk.choices[0]
    stream_chunk = StreamingChunk(
        content=choice.delta.content or "",
        meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "finish_reason": choice.finish_reason},
        component_info=component_info,
        # Index must always be 0 since we don't allow tool calls in streaming mode.
        index=0 if choice.finish_reason is None else None,
        # start is True at the very beginning, since the first chunk contains the role information
        # plus the first part of the answer.
        start=len(previous_chunks) == 0,
        # The finish reason cannot be constrained here, since models may return arbitrary values,
        # so the type error is ignored.
        finish_reason=choice.finish_reason,  # type: ignore[arg-type]
    )
    return stream_chunk
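
# Illustrative sketch (not part of the module, hypothetical values): with
# include_usage enabled, a short streamed answer yields chunks like
#
#   StreamingChunk(content="Hello", start=True, index=0, ...)          # first delta
#   StreamingChunk(content=" world", index=0, ...)                     # subsequent delta
#   StreamingChunk(content="", finish_reason="stop", index=None, ...)  # final delta
#   StreamingChunk(content="", meta={..., "usage": {"prompt_tokens": 5, "completion_tokens": 2}})  # usage-only
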

@component
class HuggingFaceAPIChatGenerator:
    """
    Completes chats using Hugging Face APIs.

    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
    format for input and output. Use it to generate text with Hugging Face APIs:
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)

    ### Usage examples

    #### With the free serverless inference API

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret
    from haystack.utils.hf import HFGenerationAPIType

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
    api_type = "serverless_inference_api"  # this is equivalent to the above

    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
                                            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With paid inference endpoints

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.utils import Secret

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
                                            api_params={"url": "<your-inference-endpoint-url>"},
                                            token=Secret.from_token("<your-api-key>"))

    result = generator.run(messages)
    print(result)
    ```

    #### With self-hosted text generation inference

    ```python
    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
    from haystack.dataclasses import ChatMessage

    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
                ChatMessage.from_user("What's Natural Language Processing?")]

    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
                                            api_params={"url": "http://localhost:8080"})

    result = generator.run(messages)
    print(result)
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_type: Union[HFGenerationAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
        generation_kwargs: Optional[Dict[str, Any]] = None,
        stop_words: Optional[List[str]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
    ):
        """
        Initialize the HuggingFaceAPIChatGenerator instance.

        :param api_type:
            The type of Hugging Face API to use. Available types:
            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
            - `serverless_inference_api`: See [Serverless Inference API](https://huggingface.co/inference-api).
        :param api_params:
            A dictionary with the following keys:
            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
            `TEXT_GENERATION_INFERENCE`.
            - Other parameters specific to the chosen API type, such as `timeout`, `headers`, `provider`, etc.
        :param token:
            The Hugging Face token to use as HTTP bearer authorization.
            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param generation_kwargs:
            A dictionary with keyword arguments to customize text generation.
            Some examples: `max_tokens`, `temperature`, `top_p`.
            For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
        :param stop_words:
            An optional list of strings representing the stop words.
        :param streaming_callback:
            An optional callable for handling streaming responses.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls.
            The chosen model should support tool/function calling, according to the model card.
            Support for tools in the Hugging Face API and TGI is not yet fully refined, and you may experience
            unexpected behavior. This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
        """

        huggingface_hub_import.check()

        if isinstance(api_type, str):
            api_type = HFGenerationAPIType.from_str(api_type)

        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.GENERATION, token)
            model_or_url = model
        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
                    "in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        if tools and streaming_callback is not None:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # handle generation kwargs setup
        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
        generation_kwargs["stop"].extend(stop_words or [])
        generation_kwargs.setdefault("max_tokens", 512)
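
        # Illustrative example (hypothetical values): generation_kwargs={"temperature": 0.7}
        # and stop_words=["<|end|>"] merge into
        # {"temperature": 0.7, "stop": ["<|end|>"], "max_tokens": 512}.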

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.generation_kwargs = generation_kwargs
        self.streaming_callback = streaming_callback

        resolved_api_params: Dict[str, Any] = {k: v for k, v in api_params.items() if k not in ("model", "url")}
        self._client = InferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )
        self._async_client = AsyncInferenceClient(
            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
        )
        self.tools = tools

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            A dictionary containing the serialized component.
        """
        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            token=self.token.to_dict() if self.token else None,
            generation_kwargs=self.generation_kwargs,
            streaming_callback=callback_name,
            tools=serialize_tools_or_toolset(self.tools),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
        """
        Deserialize this component from a dictionary.

        :param data: The dictionary representation of this component.
        :returns: The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
        init_params = data.get("init_parameters", {})
        serialized_callback_handler = init_params.get("streaming_callback")
        if serialized_callback_handler:
            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
        return default_from_dict(cls, data)
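
    # Illustrative sketch (not part of the module): serialization round-trips;
    # the token is not stored in the dict and is re-read from the environment on load:
    #
    #   data = generator.to_dict()
    #   restored = HuggingFaceAPIChatGenerator.from_dict(data)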

    @component.output_types(replies=List[ChatMessage])
    def run(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Invoke the text generation inference based on the provided messages and generation parameters.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override
            the `tools` parameter set during component initialization. This parameter can accept either a
            list of `Tool` objects or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(
            self.streaming_callback, streaming_callback, requires_async=False
        )

        if streaming_callback:
            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)

        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)
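
    # Illustrative example (not part of the module): per-call generation kwargs
    # override the defaults configured at init time:
    #
    #   generator.run(messages, generation_kwargs={"temperature": 0.2, "max_tokens": 128})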

    @component.output_types(replies=List[ChatMessage])
    async def run_async(
        self,
        messages: List[ChatMessage],
        generation_kwargs: Optional[Dict[str, Any]] = None,
        tools: Optional[Union[List[Tool], Toolset]] = None,
        streaming_callback: Optional[StreamingCallbackT] = None,
    ):
        """
        Asynchronously invokes the text generation inference based on the provided messages and generation parameters.

        This is the asynchronous version of the `run` method. It has the same parameters
        and return values but can be used with `await` in async code.

        :param messages:
            A list of ChatMessage objects representing the input messages.
        :param generation_kwargs:
            Additional keyword arguments for text generation.
        :param tools:
            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
            parameter set during component initialization. This parameter can accept either a list of `Tool` objects
            or a `Toolset` instance.
        :param streaming_callback:
            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
            parameter set during component initialization.
        :returns: A dictionary with the following keys:
            - `replies`: A list containing the generated responses as ChatMessage objects.
        """

        # update generation kwargs by merging with the default ones
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        formatted_messages = [convert_message_to_hf_format(message) for message in messages]

        tools = tools or self.tools
        if tools and self.streaming_callback:
            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
        _check_duplicate_tool_names(list(tools or []))

        # validate and select the streaming callback
        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)

        if streaming_callback:
            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)

        if tools and isinstance(tools, Toolset):
            tools = list(tools)

        hf_tools = _convert_tools_to_hfapi_tools(tools)

        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)
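
    # Illustrative example (not part of the module): the async variant is awaited
    # inside a coroutine, or driven from sync code with asyncio:
    #
    #   import asyncio
    #   result = asyncio.run(generator.run_async(messages))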

    def _run_streaming(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        streaming_callback: SyncStreamingCallbackT,
    ):
        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        component_info = ComponentInfo.from_component(self)
        streaming_chunks: List[StreamingChunk] = []
        for chunk in api_output:
            streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
            )
            streaming_chunks.append(streaming_chunk)
            streaming_callback(streaming_chunk)

        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
        if message.meta.get("usage") is None:
            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}

        return {"replies": [message]}
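
    # Illustrative sketch (not part of the module): a minimal sync streaming
    # callback that prints tokens as they arrive:
    #
    #   def print_chunk(chunk: StreamingChunk) -> None:
    #       print(chunk.content, end="", flush=True)
    #
    #   generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
    #                                           api_params={"url": "http://localhost:8080"},
    #                                           streaming_callback=print_chunk)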

    def _run_non_streaming(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if len(api_chat_output.choices) == 0:
            return {"replies": []}

        # The `n` argument is unused and the API always returns a single choice;
        # it is probably accepted only for compatibility with OpenAI.
        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        meta: Dict[str, Any] = {
            "model": self._client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}
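
    # Illustrative example (hypothetical values): a non-streaming reply carries
    # meta of the form
    #   {"model": "HuggingFaceH4/zephyr-7b-beta", "finish_reason": "stop", "index": 0,
    #    "usage": {"prompt_tokens": 16, "completion_tokens": 8}}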

    async def _run_streaming_async(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        streaming_callback: AsyncStreamingCallbackT,
    ):
        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
            messages,
            stream=True,
            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
            **generation_kwargs,
        )

        component_info = ComponentInfo.from_component(self)
        streaming_chunks: List[StreamingChunk] = []
        async for chunk in api_output:
            stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
            )
            streaming_chunks.append(stream_chunk)
            await streaming_callback(stream_chunk)  # type: ignore

        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
        if message.meta.get("usage") is None:
            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}

        return {"replies": [message]}
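
    # Illustrative sketch (not part of the module): the async path expects a
    # coroutine callback:
    #
    #   async def on_chunk(chunk: StreamingChunk) -> None:
    #       print(chunk.content, end="", flush=True)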

    async def _run_non_streaming_async(
        self,
        messages: List[Dict[str, str]],
        generation_kwargs: Dict[str, Any],
        tools: Optional[List["ChatCompletionInputTool"]] = None,
    ) -> Dict[str, List[ChatMessage]]:
        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
            messages=messages, tools=tools, **generation_kwargs
        )

        if len(api_chat_output.choices) == 0:
            return {"replies": []}

        choice = api_chat_output.choices[0]

        text = choice.message.content

        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)

        meta: Dict[str, Any] = {
            "model": self._async_client.model,
            "finish_reason": choice.finish_reason,
            "index": choice.index,
        }

        usage = {"prompt_tokens": 0, "completion_tokens": 0}
        if api_chat_output.usage:
            usage = {
                "prompt_tokens": api_chat_output.usage.prompt_tokens,
                "completion_tokens": api_chat_output.usage.completion_tokens,
            }
        meta["usage"] = usage

        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta)
        return {"replies": [message]}