9568249476

Committed 18 Jun 2024 03:52PM UTC coverage: 89.872% (-0.1%) from 89.995%

Build # 9568249476

Build Type

push

github

Committed by

web-flow

Commit Message

ci: Add code formatting checks  (#7882)

* ruff settings

enable ruff format and re-format outdated files

feat: `EvaluationRunResult` add parameter to specify columns to keep in the comparative `Dataframe`  (#7879)

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* updating tests

* adding release notes

* Update haystack/evaluation/eval_run_result.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update releasenotes/notes/add-keep-columns-to-EvalRunResult-comparative-be3e15ce45de3e0b.yaml

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* updating docstring

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

add format-check

fail on format and linting failures

fix string formatting

reformat long lines

fix tests

fix typing

linter

pull from main

* reformat

* lint -> check

* lint -> check

Run Details

6957 of 7741 relevant lines covered (89.87%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.1

haystack/components/embedders/hugging_face_api_document_embedder.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses import Document
from haystack.lazy_imports import LazyImport
from haystack.utils import Secret, deserialize_secrets_inplace
from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
from haystack.utils.url_validation import is_valid_http_url

with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
    from huggingface_hub import InferenceClient

logger = logging.getLogger(__name__)


@component
class HuggingFaceAPIDocumentEmbedder:
    """
    A component that embeds documents using Hugging Face APIs.

    This component can be used to compute Document embeddings using different Hugging Face APIs:
    - [Free Serverless Inference API](https://huggingface.co/inference-api)
    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
    - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)


    Example usage with the free Serverless Inference API:
    ```python
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
    from haystack.utils import Secret
    from haystack.dataclasses import Document

    doc = Document(content="I love pizza!")

    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="serverless_inference_api",
                                                  api_params={"model": "BAAI/bge-small-en-v1.5"},
                                                  token=Secret.from_token("<your-api-key>"))

    result = doc_embedder.run([doc])
    print(result["documents"][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```

    Example usage with paid Inference Endpoints:
    ```python
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
    from haystack.utils import Secret
    from haystack.dataclasses import Document

    doc = Document(content="I love pizza!")

    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="inference_endpoints",
                                                  api_params={"url": "<your-inference-endpoint-url>"},
                                                  token=Secret.from_token("<your-api-key>"))

    result = doc_embedder.run([doc])
    print(result["documents"][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```

    Example usage with self-hosted Text Embeddings Inference:
    ```python
    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
    from haystack.dataclasses import Document

    doc = Document(content="I love pizza!")

    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="text_embeddings_inference",
                                                  api_params={"url": "http://localhost:8080"})

    result = doc_embedder.run([doc])
    print(result["documents"][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```
    """

    def __init__(
        self,
        api_type: Union[HFEmbeddingAPIType, str],
        api_params: Dict[str, str],
        token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),
        prefix: str = "",
        suffix: str = "",
        truncate: bool = True,
        normalize: bool = False,
        batch_size: int = 32,
        progress_bar: bool = True,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
    ):
        """
        Create a HuggingFaceAPIDocumentEmbedder component.

        :param api_type:
            The type of Hugging Face API to use.
        :param api_params:
            A dictionary containing the following keys:
            - `model`: model ID on the Hugging Face Hub. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
                `TEXT_EMBEDDINGS_INFERENCE`.
        :param token: The HuggingFace token to use as HTTP bearer authorization.
            You can find your HF token in your [account settings](https://huggingface.co/settings/tokens).
        :param prefix:
            A string to add at the beginning of each text.
        :param suffix:
            A string to add at the end of each text.
        :param truncate:
            Truncate input text from the end to the maximum length supported by the model.
            This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
            It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text
            Embeddings Inference. This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API`
            (it is always set to `True` and cannot be changed).
        :param normalize:
            Normalize the embeddings to unit length.
            This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
            It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text
            Embeddings Inference. This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API`
            (it is always set to `False` and cannot be changed).
        :param batch_size:
            Number of Documents to process at once.
        :param progress_bar:
            If `True` shows a progress bar when running.
        :param meta_fields_to_embed:
            List of meta fields that will be embedded along with the Document text.
        :param embedding_separator:
            Separator used to concatenate the meta fields to the Document text.

        :raises ValueError:
            If the `model` (Serverless Inference API) or `url` (Inference Endpoints / Text Embeddings
            Inference) entry is missing from `api_params`, if the URL is not a valid HTTP URL, or if
            `api_type` is not one of the supported API types.
        """
        huggingface_hub_import.check()

        # Accept the API type as a plain string for convenience and convert it to the enum.
        if isinstance(api_type, str):
            api_type = HFEmbeddingAPIType.from_str(api_type)

        # Tolerate a falsy `api_params` (e.g. `None`) so the `.get` lookups below cannot fail.
        api_params = api_params or {}

        if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
            model = api_params.get("model")
            if model is None:
                raise ValueError(
                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
                )
            check_valid_model(model, HFModelType.EMBEDDING, token)
            model_or_url = model
        elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]:
            url = api_params.get("url")
            if url is None:
                msg = (
                    "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` "
                    "parameter in `api_params`."
                )
                raise ValueError(msg)
            if not is_valid_http_url(url):
                raise ValueError(f"Invalid URL: {url}")
            model_or_url = url
        else:
            # Raise with the descriptive message rather than the bare enum value.
            msg = f"Unknown api_type {api_type}"
            raise ValueError(msg)

        self.api_type = api_type
        self.api_params = api_params
        self.token = token
        self.prefix = prefix
        self.suffix = suffix
        self.truncate = truncate
        self.normalize = normalize
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.meta_fields_to_embed = meta_fields_to_embed or []
        self.embedding_separator = embedding_separator
        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            api_type=str(self.api_type),
            api_params=self.api_params,
            prefix=self.prefix,
            suffix=self.suffix,
            # The token is stored in its own serialized form, or as `None` when no token is set.
            token=self.token.to_dict() if self.token else None,
            truncate=self.truncate,
            normalize=self.normalize,
            batch_size=self.batch_size,
            progress_bar=self.progress_bar,
            meta_fields_to_embed=self.meta_fields_to_embed,
            embedding_separator=self.embedding_separator,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        # The `token` entry is converted in place before the init parameters are passed on.
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        return default_from_dict(cls, data)

    def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
        """
        Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.

        :param documents:
            Documents whose content and selected meta fields are turned into texts.
        :returns:
            One string per Document, in the same order as `documents`.
        """
        texts_to_embed = []
        for doc in documents:
            # Only meta fields that are present and not `None` contribute to the text.
            meta_values_to_embed = [
                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
            ]

            # Meta values come first, then the content (empty string when the Document has none).
            text_to_embed = (
                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
            )

            texts_to_embed.append(text_to_embed)
        return texts_to_embed

    def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]:
        """
        Embed a list of texts in batches.

        :param texts_to_embed:
            Texts to send to the API.
        :param batch_size:
            Number of texts sent per request.
        :returns:
            The decoded embeddings of all batches, concatenated in request order.
        """

        all_embeddings: List[List[float]] = []
        for i in tqdm(
            range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
        ):
            batch = texts_to_embed[i : i + batch_size]
            response = self._client.post(
                json={"inputs": batch, "truncate": self.truncate, "normalize": self.normalize},
                task="feature-extraction",
            )
            # The raw response is decoded to text and parsed as JSON.
            embeddings = json.loads(response.decode())
            all_embeddings.extend(embeddings)

        return all_embeddings

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embed a list of Documents.

        The embeddings are written to the `embedding` attribute of the given Documents in place.

        :param documents:
            Documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: Documents with embeddings

        :raises TypeError:
            If `documents` is not a list, or if its first element is not a Document.
        """
        # Only the first element is inspected; an empty list is accepted.
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError(
                "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input."
                " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder."
            )

        texts_to_embed = self._prepare_texts_to_embed(documents=documents)

        embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)

        # Documents and embeddings are paired positionally.
        for doc, emb in zip(documents, embeddings):
            doc.embedding = emb

        return {"documents": documents}

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import json	1✔
6	from typing import Any, Dict, List, Optional, Union	1✔
7
8	from tqdm import tqdm	1✔
9
10	from haystack import component, default_from_dict, default_to_dict, logging	1✔
11	from haystack.dataclasses import Document	1✔
12	from haystack.lazy_imports import LazyImport	1✔
13	from haystack.utils import Secret, deserialize_secrets_inplace	1✔
14	from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model	1✔
15	from haystack.utils.url_validation import is_valid_http_url	1✔
16
17	with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:	1✔
18	from huggingface_hub import InferenceClient	1✔
19
20	logger = logging.getLogger(__name__)	1✔
21
22
23	@component	1✔
24	class HuggingFaceAPIDocumentEmbedder:	1✔
25	"""
26	A component that embeds documents using Hugging Face APIs.
27
28	This component can be used to compute Document embeddings using different Hugging Face APIs:
29	- [Free Serverless Inference API]((https://huggingface.co/inference-api)
30	- [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
31	- [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)
32
33
34	Example usage with the free Serverless Inference API:
35	```python
36	from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
37	from haystack.utils import Secret
38	from haystack.dataclasses import Document
39
40	doc = Document(content="I love pizza!")
41
42	doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="serverless_inference_api",
43	api_params={"model": "BAAI/bge-small-en-v1.5"},
44	token=Secret.from_token("<your-api-key>"))
45
46	result = document_embedder.run([doc])
47	print(result["documents"][0].embedding)
48
49	# [0.017020374536514282, -0.023255806416273117, ...]
50	```
51
52	Example usage with paid Inference Endpoints:
53	```python
54	from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
55	from haystack.utils import Secret
56	from haystack.dataclasses import Document
57
58	doc = Document(content="I love pizza!")
59
60	doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="inference_endpoints",
61	api_params={"url": "<your-inference-endpoint-url>"},
62	token=Secret.from_token("<your-api-key>"))
63
64	result = document_embedder.run([doc])
65	print(result["documents"][0].embedding)
66
67	# [0.017020374536514282, -0.023255806416273117, ...]
68	```
69
70	Example usage with self-hosted Text Embeddings Inference:
71	```python
72	from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
73	from haystack.dataclasses import Document
74
75	doc = Document(content="I love pizza!")
76
77	doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="text_embeddings_inference",
78	api_params={"url": "http://localhost:8080"})
79
80	result = document_embedder.run([doc])
81	print(result["documents"][0].embedding)
82
83	# [0.017020374536514282, -0.023255806416273117, ...]
84	```
85	"""
86
87	def __init__(	1✔
88	self,
89	api_type: Union[HFEmbeddingAPIType, str],
90	api_params: Dict[str, str],
91	token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN", strict=False),
92	prefix: str = "",
93	suffix: str = "",
94	truncate: bool = True,
95	normalize: bool = False,
96	batch_size: int = 32,
97	progress_bar: bool = True,
98	meta_fields_to_embed: Optional[List[str]] = None,
99	embedding_separator: str = "\n",
100	):
101	"""
102	Create an HuggingFaceAPITextEmbedder component.
103
104	:param api_type:
105	The type of Hugging Face API to use.
106	:param api_params:
107	A dictionary containing the following keys:
108	- `model`: model ID on the Hugging Face Hub. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
109	- `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
110	`TEXT_EMBEDDINGS_INFERENCE`.
111	:param token: The HuggingFace token to use as HTTP bearer authorization.
112	You can find your HF token in your [account settings](https://huggingface.co/settings/tokens).
113	:param prefix:
114	A string to add at the beginning of each text.
115	:param suffix:
116	A string to add at the end of each text.
117	:param truncate:
118	Truncate input text from the end to the maximum length supported by the model.
119	This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
120	It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text
121	Embeddings Inference. This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API`
122	(it is always set to `True` and cannot be changed).
123	:param normalize:
124	Normalize the embeddings to unit length.
125	This parameter takes effect when the `api_type` is `TEXT_EMBEDDINGS_INFERENCE`.
126	It also takes effect when the `api_type` is `INFERENCE_ENDPOINTS` and the backend is based on Text
127	Embeddings Inference. This parameter is ignored when the `api_type` is `SERVERLESS_INFERENCE_API`
128	(it is always set to `False` and cannot be changed).
129	:param batch_size:
130	Number of Documents to process at once.
131	:param progress_bar:
132	If `True` shows a progress bar when running.
133	:param meta_fields_to_embed:
134	List of meta fields that will be embedded along with the Document text.
135	:param embedding_separator:
136	Separator used to concatenate the meta fields to the Document text.
137	"""
138	huggingface_hub_import.check()	1✔
139
140	if isinstance(api_type, str):	1✔
141	api_type = HFEmbeddingAPIType.from_str(api_type)	1✔
142
143	api_params = api_params or {}	1✔
144
145	if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:	1✔
146	model = api_params.get("model")	1✔
147	if model is None:	1✔
148	raise ValueError(	1✔
149	"To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
150	)
151	check_valid_model(model, HFModelType.EMBEDDING, token)	1✔
152	model_or_url = model	1✔
153	elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]:	1✔
154	url = api_params.get("url")	1✔
155	if url is None:	1✔
156	msg = (	1✔
157	"To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` "
158	"parameter in `api_params`."
159	)
160	raise ValueError(msg)	1✔
161	if not is_valid_http_url(url):	1✔
162	raise ValueError(f"Invalid URL: {url}")	1✔
163	model_or_url = url	1✔
164	else:
165	msg = f"Unknown api_type {api_type}"	×
166	raise ValueError(api_type)	×
167
168	self.api_type = api_type	1✔
169	self.api_params = api_params	1✔
170	self.token = token	1✔
171	self.prefix = prefix	1✔
172	self.suffix = suffix	1✔
173	self.truncate = truncate	1✔
174	self.normalize = normalize	1✔
175	self.batch_size = batch_size	1✔
176	self.progress_bar = progress_bar	1✔
177	self.meta_fields_to_embed = meta_fields_to_embed or []	1✔
178	self.embedding_separator = embedding_separator	1✔
179	self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)	1✔
180
181	def to_dict(self) -> Dict[str, Any]:	1✔
182	"""
183	Serializes the component to a dictionary.
184
185	:returns:
186	Dictionary with serialized data.
187	"""
188	return default_to_dict(	1✔
189	self,
190	api_type=str(self.api_type),
191	api_params=self.api_params,
192	prefix=self.prefix,
193	suffix=self.suffix,
194	token=self.token.to_dict() if self.token else None,
195	truncate=self.truncate,
196	normalize=self.normalize,
197	batch_size=self.batch_size,
198	progress_bar=self.progress_bar,
199	meta_fields_to_embed=self.meta_fields_to_embed,
200	embedding_separator=self.embedding_separator,
201	)
202
203	@classmethod	1✔
204	def from_dict(cls, data: Dict[str, Any]) -> "HuggingFaceAPIDocumentEmbedder":	1✔
205	"""
206	Deserializes the component from a dictionary.
207
208	:param data:
209	Dictionary to deserialize from.
210	:returns:
211	Deserialized component.
212	"""
213	deserialize_secrets_inplace(data["init_parameters"], keys=["token"])	1✔
214	return default_from_dict(cls, data)	1✔
215
216	def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:	1✔
217	"""
218	Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.
219	"""
220	texts_to_embed = []	1✔
221	for doc in documents:	1✔
222	meta_values_to_embed = [	1✔
223	str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
224	]
225
226	text_to_embed = (	1✔
227	self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
228	)
229
230	texts_to_embed.append(text_to_embed)	1✔
231	return texts_to_embed	1✔
232
233	def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[float]]:	1✔
234	"""
235	Embed a list of texts in batches.
236	"""
237
238	all_embeddings = []	1✔
239	for i in tqdm(	1✔
240	range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
241	):
242	batch = texts_to_embed[i : i + batch_size]	1✔
243	response = self._client.post(	1✔
244	json={"inputs": batch, "truncate": self.truncate, "normalize": self.normalize},
245	task="feature-extraction",
246	)
247	embeddings = json.loads(response.decode())	1✔
248	all_embeddings.extend(embeddings)	1✔
249
250	return all_embeddings	1✔
251
252	@component.output_types(documents=List[Document])	1✔
253	def run(self, documents: List[Document]):	1✔
254	"""
255	Embed a list of Documents.
256
257	:param documents:
258	Documents to embed.
259
260	:returns:
261	A dictionary with the following keys:
262	- `documents`: Documents with embeddings
263	"""
264	if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):	1✔
265	raise TypeError(	×
266	"HuggingFaceAPIDocumentEmbedder expects a list of Documents as input."
267	" In case you want to embed a string, please use the HuggingFaceAPITextEmbedder."
268	)
269
270	texts_to_embed = self._prepare_texts_to_embed(documents=documents)	1✔
271
272	embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)	1✔
273
274	for doc, emb in zip(documents, embeddings):	1✔
275	doc.embedding = emb	1✔
276
277	return {"documents": documents}	1✔

deepset-ai / haystack / 9568249476

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous