• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 14591523099

22 Apr 2025 09:32AM UTC coverage: 90.514% (+0.01%) from 90.504%
14591523099

Pull #9270

github

web-flow
Merge 42a33e27f into 114b4568b
Pull Request #9270: feat: Allow OpenAI client config in other components

10830 of 11965 relevant lines covered (90.51%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.68
haystack/components/embedders/openai_text_embedder.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import os
1✔
6
from typing import Any, Dict, List, Optional
1✔
7

8
from openai import AsyncOpenAI, OpenAI
1✔
9
from openai.types import CreateEmbeddingResponse
1✔
10

11
from haystack import component, default_from_dict, default_to_dict
1✔
12
from haystack.utils import Secret, deserialize_secrets_inplace
1✔
13
from haystack.utils.http_client import init_http_client
1✔
14

15

16
@component
class OpenAITextEmbedder:
    """
    Embeds strings using OpenAI models.

    You can use it to embed a user query and send it to an embedding Retriever.

    ### Usage example

    ```python
    from haystack.components.embedders import OpenAITextEmbedder

    text_to_embed = "I love pizza!"

    text_embedder = OpenAITextEmbedder()

    print(text_embedder.run(text_to_embed))

    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
    # 'meta': {'model': 'text-embedding-ada-002-v2',
    #          'usage': {'prompt_tokens': 4, 'total_tokens': 4}}}
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
        model: str = "text-embedding-ada-002",
        dimensions: Optional[int] = None,
        api_base_url: Optional[str] = None,
        organization: Optional[str] = None,
        prefix: str = "",
        suffix: str = "",
        timeout: Optional[float] = None,
        max_retries: Optional[int] = None,
        http_client_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates an OpenAITextEmbedder component.

        Before initializing the component, you can set the 'OPENAI_TIMEOUT' and 'OPENAI_MAX_RETRIES'
        environment variables to override the `timeout` and `max_retries` parameters respectively
        in the OpenAI client.

        :param api_key:
            The OpenAI API key.
            You can set it with an environment variable `OPENAI_API_KEY`, or pass with this parameter
            during initialization.
        :param model:
            The name of the model to use for calculating embeddings.
            The default model is `text-embedding-ada-002`.
        :param dimensions:
            The number of dimensions of the resulting embeddings. Only `text-embedding-3` and
            later models support this parameter.
        :param api_base_url:
            Overrides default base URL for all HTTP requests.
        :param organization:
            Your organization ID. See OpenAI's
            [production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization)
            for more information.
        :param prefix:
            A string to add at the beginning of each text to embed.
        :param suffix:
            A string to add at the end of each text to embed.
        :param timeout:
            Timeout for OpenAI client calls. If not set, it defaults to either the
            `OPENAI_TIMEOUT` environment variable, or 30 seconds.
        :param max_retries:
            Maximum number of retries to contact OpenAI after an internal error.
            If not set, it defaults to either the `OPENAI_MAX_RETRIES` environment variable, or set to 5.
        :param http_client_kwargs:
            A dictionary of keyword arguments to configure a custom `httpx.Client` or `httpx.AsyncClient`.
            For more information, see the [HTTPX documentation](https://www.python-httpx.org/api/#client).
        """
        self.model = model
        self.dimensions = dimensions
        self.api_base_url = api_base_url
        self.organization = organization
        self.prefix = prefix
        self.suffix = suffix
        self.api_key = api_key
        self.http_client_kwargs = http_client_kwargs

        # An explicit argument wins; otherwise fall back to the environment
        # variable and finally to the hard-coded default.
        if timeout is None:
            timeout = float(os.environ.get("OPENAI_TIMEOUT", "30.0"))
        if max_retries is None:
            max_retries = int(os.environ.get("OPENAI_MAX_RETRIES", "5"))

        # The same settings are shared by the sync and the async client.
        client_kwargs: Dict[str, Any] = {
            "api_key": api_key.resolve_value(),
            "organization": organization,
            "base_url": api_base_url,
            "timeout": timeout,
            "max_retries": max_retries,
        }

        self.client = OpenAI(http_client=init_http_client(self.http_client_kwargs, async_client=False), **client_kwargs)
        self.async_client = AsyncOpenAI(
            http_client=init_http_client(self.http_client_kwargs, async_client=True), **client_kwargs
        )

    def _get_telemetry_data(self) -> Dict[str, Any]:
        """
        Data that is sent to Posthog for usage analytics.
        """
        return {"model": self.model}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        # NOTE(review): `timeout` and `max_retries` are not stored on the instance in
        # `__init__`, so they are not part of the serialized data.
        return default_to_dict(
            self,
            model=self.model,
            api_base_url=self.api_base_url,
            organization=self.organization,
            prefix=self.prefix,
            suffix=self.suffix,
            dimensions=self.dimensions,
            api_key=self.api_key.to_dict(),
            http_client_kwargs=self.http_client_kwargs,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "OpenAITextEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    def _prepare_input(self, text: str) -> Dict[str, Any]:
        """
        Builds the keyword arguments for the embeddings `create` call.

        :param text:
            Text to embed. The configured prefix and suffix are added around it.
        :returns:
            A dictionary with `model` and `input`, plus `dimensions` when it is set on the component.
        :raises TypeError:
            If `text` is not a string.
        """
        if not isinstance(text, str):
            # The two literals are concatenated implicitly, so the first one
            # must end with a space to keep the sentences separated.
            raise TypeError(
                "OpenAITextEmbedder expects a string as an input. "
                "In case you want to embed a list of Documents, please use the OpenAIDocumentEmbedder."
            )

        text_to_embed = self.prefix + text + self.suffix

        kwargs: Dict[str, Any] = {"model": self.model, "input": text_to_embed}
        # `dimensions` is only sent when explicitly configured.
        if self.dimensions is not None:
            kwargs["dimensions"] = self.dimensions
        return kwargs

    def _prepare_output(self, result: CreateEmbeddingResponse) -> Dict[str, Any]:
        """
        Converts the embeddings response into the component output.

        :param result:
            The response returned by the embeddings `create` call.
        :returns:
            A dictionary with the first embedding of the response under `embedding` and the
            response's model name and usage under `meta`.
        """
        return {"embedding": result.data[0].embedding, "meta": {"model": result.model, "usage": dict(result.usage)}}

    @component.output_types(embedding=List[float], meta=Dict[str, Any])
    def run(self, text: str):
        """
        Embeds a single string.

        :param text:
            Text to embed.

        :returns:
            A dictionary with the following keys:
            - `embedding`: The embedding of the input text.
            - `meta`: Information about the usage of the model.

        :raises TypeError:
            If `text` is not a string.
        """
        create_kwargs = self._prepare_input(text=text)
        response = self.client.embeddings.create(**create_kwargs)
        return self._prepare_output(result=response)

    @component.output_types(embedding=List[float], meta=Dict[str, Any])
    async def run_async(self, text: str):
        """
        Asynchronously embed a single string.

        This is the asynchronous version of the `run` method. It has the same parameters and return values
        but can be used with `await` in async code.

        :param text:
            Text to embed.

        :returns:
            A dictionary with the following keys:
            - `embedding`: The embedding of the input text.
            - `meta`: Information about the usage of the model.

        :raises TypeError:
            If `text` is not a string.
        """
        create_kwargs = self._prepare_input(text=text)
        response = await self.async_client.embeddings.create(**create_kwargs)
        return self._prepare_output(result=response)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc