deepset-ai / haystack, build 12240140835
09 Dec 2024 04:39PM UTC · coverage: 90.335% (+0.001% from 90.334%)

Pull Request #8610: chore: fixing `pylint` issues
Merge 3ff0aa0e9 into 6f983a22c (via GitHub web-flow)

8038 of 8898 relevant lines covered (90.33%) · 0.9 hits per line

Source File: haystack/components/evaluators/sas_evaluator.py (63.33% of lines covered)
Uncovered lines fall in warm_up() and in the model-specific branches of run().

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, expit
from haystack.utils.auth import Secret, deserialize_secrets_inplace

with LazyImport(message="Run 'pip install \"sentence-transformers>=3.0.0\"'") as sas_import:
    from sentence_transformers import CrossEncoder, SentenceTransformer, util
    from transformers import AutoConfig


@component
class SASEvaluator:
    """
    SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of ground truths.

    It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated
    answers. The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
    Bi-Encoder or a Cross-Encoder. The choice of the model is based on the `model` parameter.

    Usage example:
    ```python
    from haystack.components.evaluators.sas_evaluator import SASEvaluator

    evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
    evaluator.warm_up()
    ground_truths = [
        "A construction budget of US $2.3 billion",
        "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
    ]
    predictions = [
        "A construction budget of US $2.3 billion",
        "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
    ]
    result = evaluator.run(
        ground_truth_answers=ground_truths, predicted_answers=predictions
    )

    print(result["score"])
    # 0.9999673763910929

    print(result["individual_scores"])
    # [0.9999765157699585, 0.999968409538269, 0.9999572038650513]
    ```
    """

    def __init__(
        self,
        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        batch_size: int = 32,
        device: Optional[ComponentDevice] = None,
        token: Secret = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
    ):
        """
        Creates a new instance of SASEvaluator.

        :param model:
            SentenceTransformers semantic textual similarity model, should be a path or string pointing to a
            downloadable model.
        :param batch_size:
            Number of prediction-label pairs to encode at once.
        :param device:
            The device on which the model is loaded. If `None`, the default device is automatically selected.
        :param token:
            The Hugging Face token for HTTP bearer authorization.
            You can find your HF token in your [account settings](https://huggingface.co/settings/tokens).
        """
        sas_import.check()

        self._model = model
        self._batch_size = batch_size
        self._device = device
        self._token = token
        self._similarity_model = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            The serialized component as a dictionary.
        """
        return default_to_dict(
            self,
            model=self._model,
            batch_size=self._batch_size,
            device=self._device.to_dict() if self._device else None,
            token=self._token.to_dict() if self._token else None,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SASEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        if device := data.get("init_parameters", {}).get("device"):
            data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Initializes the component.
        """
        if self._similarity_model:
            return

        token = self._token.resolve_value() if self._token else None
        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
        cross_encoder_used = False
        if config.architectures:
            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
        device = ComponentDevice.resolve_device(self._device).to_torch_str()
        # Based on the model we load either a Bi-Encoder or a Cross-Encoder.
        # The similarity computation differs between the two approaches.
        if cross_encoder_used:
            self._similarity_model = CrossEncoder(
                self._model,
                device=device,
                tokenizer_args={"use_auth_token": token},
                automodel_args={"use_auth_token": token},
            )
        else:
            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)

    @component.output_types(score=float, individual_scores=List[float])
    def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
        """
        SASEvaluator component run method.

        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predicted answers
        and a list of ground truth answers. Both must be lists of strings of the same length.

        :param ground_truth_answers:
            A list of expected answers for each question.
        :param predicted_answers:
            A list of generated answers for each question.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean SAS score over all the predictions/ground-truth pairs.
                - `individual_scores`: A list of similarity scores for each prediction/ground-truth pair.
        """
        if len(ground_truth_answers) != len(predicted_answers):
            raise ValueError("The number of predictions and labels must be the same.")

        if any(answer is None for answer in predicted_answers):
            raise ValueError("Predicted answers must not contain None values.")

        if len(predicted_answers) == 0:
            return {"score": 0.0, "individual_scores": [0.0]}

        if not self._similarity_model:
            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
            raise RuntimeError(msg)

        if isinstance(self._similarity_model, CrossEncoder):
            # For Cross-Encoders we create a list of pairs of predictions and labels
            sentence_pairs = list(zip(predicted_answers, ground_truth_answers))
            similarity_scores = self._similarity_model.predict(
                sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True
            )

            # Not all Cross-Encoders return normalized scores; some return raw logits.
            # If any score is larger than 1, we map the scores to (0, 1) with the logistic sigmoid (expit).
            if (similarity_scores > 1).any():
                similarity_scores = expit(similarity_scores)

            # Convert scores from a numpy array to a list of floats
            similarity_scores = similarity_scores.tolist()

        else:
            # For Bi-Encoders we create embeddings separately for predictions and labels
            predictions_embeddings = self._similarity_model.encode(
                predicted_answers, batch_size=self._batch_size, convert_to_tensor=True
            )
            label_embeddings = self._similarity_model.encode(
                ground_truth_answers, batch_size=self._batch_size, convert_to_tensor=True
            )

            # Compute cosine similarities between each prediction/label embedding pair
            similarity_scores = [
                float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
            ]

        sas_score = np_mean(similarity_scores)

        return {"score": sas_score, "individual_scores": similarity_scores}
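
For reference, a minimal usage sketch of the component above (not part of the covered source): it exercises the default Bi-Encoder path and the to_dict()/from_dict() round trip, and assumes `sentence-transformers>=3.0.0` is installed and the default model can be downloaded from the Hugging Face Hub.

```python
from haystack.components.evaluators.sas_evaluator import SASEvaluator

# The default model is a Bi-Encoder, so run() takes the cosine-similarity branch.
evaluator = SASEvaluator()

# Serialization round trip: to_dict() keeps model/batch_size/device/token,
# from_dict() rebuilds the secret (and the device, if one was set).
evaluator = SASEvaluator.from_dict(evaluator.to_dict())

# warm_up() must be called before run(), otherwise a RuntimeError is raised.
evaluator.warm_up()

result = evaluator.run(
    ground_truth_answers=["The Eiffel Tower was completed in 1889."],
    predicted_answers=["The Eiffel Tower, completed in 1889, is in Paris."],
)
print(result["score"])              # mean SAS over all pairs
print(result["individual_scores"])  # one similarity score per pair
```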