• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 15186489091

22 May 2025 12:24PM UTC coverage: 90.423% (-0.03%) from 90.454%
15186489091

Pull #9406

github

web-flow
Merge 7a1615cb3 into e6a53b9dc
Pull Request #9406: feat: Extend AnswerBuilder for Agent

11104 of 12280 relevant lines covered (90.42%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.09
haystack/components/embedders/openai_text_embedder.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import os
1✔
6
from typing import Any, Dict, List, Optional
1✔
7

8
from openai import AsyncOpenAI, OpenAI
1✔
9
from openai.types import CreateEmbeddingResponse
1✔
10

11
from haystack import component, default_from_dict, default_to_dict
1✔
12
from haystack.utils import Secret, deserialize_secrets_inplace
1✔
13
from haystack.utils.http_client import init_http_client
1✔
14

15

16
@component
class OpenAITextEmbedder:
    """
    Embeds strings using OpenAI models.

    You can use it to embed a user query and send it to an embedding Retriever.

    ### Usage example

    ```python
    from haystack.components.embedders import OpenAITextEmbedder

    text_to_embed = "I love pizza!"

    text_embedder = OpenAITextEmbedder()

    print(text_embedder.run(text_to_embed))

    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
    # 'meta': {'model': 'text-embedding-ada-002-v2',
    #          'usage': {'prompt_tokens': 4, 'total_tokens': 4}}}
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
        model: str = "text-embedding-ada-002",
        dimensions: Optional[int] = None,
        api_base_url: Optional[str] = None,
        organization: Optional[str] = None,
        prefix: str = "",
        suffix: str = "",
        timeout: Optional[float] = None,
        max_retries: Optional[int] = None,
        http_client_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Creates an OpenAITextEmbedder component.

        Before initializing the component, you can set the 'OPENAI_TIMEOUT' and 'OPENAI_MAX_RETRIES'
        environment variables to override the `timeout` and `max_retries` parameters respectively
        in the OpenAI client.

        :param api_key:
            The OpenAI API key.
            You can set it with an environment variable `OPENAI_API_KEY`, or pass with this parameter
            during initialization.
        :param model:
            The name of the model to use for calculating embeddings.
            The default model is `text-embedding-ada-002`.
        :param dimensions:
            The number of dimensions of the resulting embeddings. Only `text-embedding-3` and
            later models support this parameter.
        :param api_base_url:
            Overrides default base URL for all HTTP requests.
        :param organization:
            Your organization ID. See OpenAI's
            [production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization)
            for more information.
        :param prefix:
            A string to add at the beginning of each text to embed.
        :param suffix:
            A string to add at the end of each text to embed.
        :param timeout:
            Timeout for OpenAI client calls. If not set, it defaults to either the
            `OPENAI_TIMEOUT` environment variable, or 30 seconds.
        :param max_retries:
            Maximum number of retries to contact OpenAI after an internal error.
            If not set, it defaults to either the `OPENAI_MAX_RETRIES` environment variable, or 5.
        :param http_client_kwargs:
            A dictionary of keyword arguments to configure a custom `httpx.Client` or `httpx.AsyncClient`.
            For more information, see the [HTTPX documentation](https://www.python-httpx.org/api/#client).
        """
        self.model = model
        self.dimensions = dimensions
        self.api_base_url = api_base_url
        self.organization = organization
        self.prefix = prefix
        self.suffix = suffix
        self.api_key = api_key
        # Store the values exactly as passed (possibly None): `to_dict` serializes these
        # attributes, while the fallbacks below only affect the clients built here.
        self.timeout = timeout
        self.max_retries = max_retries
        self.http_client_kwargs = http_client_kwargs

        # Explicit arguments win; otherwise fall back to the environment variable, then the default.
        if timeout is None:
            timeout = float(os.environ.get("OPENAI_TIMEOUT", "30.0"))
        if max_retries is None:
            max_retries = int(os.environ.get("OPENAI_MAX_RETRIES", "5"))

        # The same settings are shared by the sync and the async client.
        client_kwargs: Dict[str, Any] = {
            "api_key": api_key.resolve_value(),
            "organization": organization,
            "base_url": api_base_url,
            "timeout": timeout,
            "max_retries": max_retries,
        }

        self.client = OpenAI(http_client=init_http_client(self.http_client_kwargs, async_client=False), **client_kwargs)
        self.async_client = AsyncOpenAI(
            http_client=init_http_client(self.http_client_kwargs, async_client=True), **client_kwargs
        )

    def _get_telemetry_data(self) -> Dict[str, Any]:
        """
        Data that is sent to Posthog for usage analytics.

        :returns:
            A dictionary containing the configured model name.
        """
        return {"model": self.model}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            model=self.model,
            dimensions=self.dimensions,
            api_base_url=self.api_base_url,
            organization=self.organization,
            prefix=self.prefix,
            suffix=self.suffix,
            timeout=self.timeout,
            max_retries=self.max_retries,
            http_client_kwargs=self.http_client_kwargs,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "OpenAITextEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        # `api_key` is stored in serialized form by `to_dict`; restore it in place before construction.
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    def _prepare_input(self, text: str) -> Dict[str, Any]:
        """
        Builds the keyword arguments for the embeddings `create` call.

        :param text:
            Text to embed. The configured prefix and suffix are added around it.
        :returns:
            A dictionary with the `model` and `input` keys, plus `dimensions` when it was set.
        :raises TypeError:
            If `text` is not a string.
        """
        if not isinstance(text, str):
            raise TypeError(
                "OpenAITextEmbedder expects a string as an input. "
                "In case you want to embed a list of Documents, please use the OpenAIDocumentEmbedder."
            )

        text_to_embed = self.prefix + text + self.suffix

        kwargs: Dict[str, Any] = {"model": self.model, "input": text_to_embed}
        # Only send `dimensions` when explicitly configured.
        if self.dimensions is not None:
            kwargs["dimensions"] = self.dimensions
        return kwargs

    def _prepare_output(self, result: CreateEmbeddingResponse) -> Dict[str, Any]:
        """
        Converts an embeddings response into the component's output dictionary.

        :param result:
            The response returned by the embeddings `create` call.
        :returns:
            A dictionary with the first returned embedding under `embedding` and the
            response's model name and usage under `meta`.
        """
        return {"embedding": result.data[0].embedding, "meta": {"model": result.model, "usage": dict(result.usage)}}

    @component.output_types(embedding=List[float], meta=Dict[str, Any])
    def run(self, text: str) -> Dict[str, Any]:
        """
        Embeds a single string.

        :param text:
            Text to embed.

        :returns:
            A dictionary with the following keys:
            - `embedding`: The embedding of the input text.
            - `meta`: Information about the usage of the model.

        :raises TypeError:
            If `text` is not a string.
        """
        create_kwargs = self._prepare_input(text=text)
        response = self.client.embeddings.create(**create_kwargs)
        return self._prepare_output(result=response)

    @component.output_types(embedding=List[float], meta=Dict[str, Any])
    async def run_async(self, text: str) -> Dict[str, Any]:
        """
        Asynchronously embed a single string.

        This is the asynchronous version of the `run` method. It has the same parameters and return values
        but can be used with `await` in async code.

        :param text:
            Text to embed.

        :returns:
            A dictionary with the following keys:
            - `embedding`: The embedding of the input text.
            - `meta`: Information about the usage of the model.

        :raises TypeError:
            If `text` is not a string.
        """
        create_kwargs = self._prepare_input(text=text)
        response = await self.async_client.embeddings.create(**create_kwargs)
        return self._prepare_output(result=response)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc