deepset-ai / haystack, build 12240140835
09 Dec 2024 04:39PM UTC · coverage: 90.335% (+0.001% from 90.334%)

Pull Request #8610: chore: fixing `pylint` issues
Merge 3ff0aa0e9 into 6f983a22c (via GitHub web-flow)

8038 of 8898 relevant lines covered (90.33%) · 0.9 hits per line

Source File: haystack/components/evaluators/sas_evaluator.py (63.33% of lines covered)
Uncovered lines fall in warm_up() and in the model-specific branches of run().

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack.lazy_imports import LazyImport
from haystack.utils import ComponentDevice, expit
from haystack.utils.auth import Secret, deserialize_secrets_inplace

with LazyImport(message="Run 'pip install \"sentence-transformers>=3.0.0\"'") as sas_import:
    from sentence_transformers import CrossEncoder, SentenceTransformer, util
    from transformers import AutoConfig


@component
class SASEvaluator:
    """
    SASEvaluator computes the Semantic Answer Similarity (SAS) between a list of predictions and a list of ground truths.

    It's usually used in Retrieval Augmented Generation (RAG) pipelines to evaluate the quality of the generated
    answers. The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
    Bi-Encoder or a Cross-Encoder. The choice of the model is based on the `model` parameter.

    Usage example:
    ```python
    from haystack.components.evaluators.sas_evaluator import SASEvaluator

    evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
    evaluator.warm_up()
    ground_truths = [
        "A construction budget of US $2.3 billion",
        "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
    ]
    predictions = [
        "A construction budget of US $2.3 billion",
        "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
        "The Meiji Restoration in 1868 transformed Japan into a modernized world power.",
    ]
    result = evaluator.run(
        ground_truth_answers=ground_truths, predicted_answers=predictions
    )

    print(result["score"])
    # 0.9999673763910929

    print(result["individual_scores"])
    # [0.9999765157699585, 0.999968409538269, 0.9999572038650513]
    ```
    """

    def __init__(
        self,
        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        batch_size: int = 32,
        device: Optional[ComponentDevice] = None,
        token: Secret = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
    ):
        """
        Creates a new instance of SASEvaluator.

        :param model:
            SentenceTransformers semantic textual similarity model, should be a path or string pointing to a
            downloadable model.
        :param batch_size:
            Number of prediction-label pairs to encode at once.
        :param device:
            The device on which the model is loaded. If `None`, the default device is automatically selected.
        :param token:
            The Hugging Face token for HTTP bearer authorization.
            You can find your HF token in your [account settings](https://huggingface.co/settings/tokens).
        """
        sas_import.check()

        self._model = model
        self._batch_size = batch_size
        self._device = device
        self._token = token
        self._similarity_model = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            The serialized component as a dictionary.
        """
        return default_to_dict(
            self,
            model=self._model,
            batch_size=self._batch_size,
            device=self._device.to_dict() if self._device else None,
            token=self._token.to_dict() if self._token else None,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SASEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
        if device := data.get("init_parameters", {}).get("device"):
            data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
        return default_from_dict(cls, data)

    def warm_up(self):
        """
        Initializes the component.
        """
        if self._similarity_model:
            return

        token = self._token.resolve_value() if self._token else None
        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
        cross_encoder_used = False
        if config.architectures:
            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
        device = ComponentDevice.resolve_device(self._device).to_torch_str()
        # Based on the model we load either a Bi-Encoder or a Cross-Encoder.
        # The similarity computation differs between the two approaches.
        if cross_encoder_used:
            self._similarity_model = CrossEncoder(
                self._model,
                device=device,
                tokenizer_args={"use_auth_token": token},
                automodel_args={"use_auth_token": token},
            )
        else:
            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)

    @component.output_types(score=float, individual_scores=List[float])
    def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) -> Dict[str, Any]:
        """
        SASEvaluator component run method.

        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predicted answers
        and a list of ground truth answers. Both must be lists of strings of the same length.

        :param ground_truth_answers:
            A list of expected answers for each question.
        :param predicted_answers:
            A list of generated answers for each question.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean SAS score over all the predictions/ground-truth pairs.
                - `individual_scores`: A list of similarity scores for each prediction/ground-truth pair.
        """
        if len(ground_truth_answers) != len(predicted_answers):
            raise ValueError("The number of predictions and labels must be the same.")

        if any(answer is None for answer in predicted_answers):
            raise ValueError("Predicted answers must not contain None values.")

        if len(predicted_answers) == 0:
            return {"score": 0.0, "individual_scores": [0.0]}

        if not self._similarity_model:
            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
            raise RuntimeError(msg)

        if isinstance(self._similarity_model, CrossEncoder):
            # For Cross-Encoders we create a list of pairs of predictions and labels
            sentence_pairs = list(zip(predicted_answers, ground_truth_answers))
            similarity_scores = self._similarity_model.predict(
                sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True
            )

            # Not all Cross-Encoders return normalized scores; some return raw logits.
            # If any score is larger than 1, we map the scores to (0, 1) with the logistic sigmoid (expit).
            if (similarity_scores > 1).any():
                similarity_scores = expit(similarity_scores)

            # Convert scores from a numpy array to a list of floats
            similarity_scores = similarity_scores.tolist()

        else:
            # For Bi-Encoders we create embeddings separately for predictions and labels
            predictions_embeddings = self._similarity_model.encode(
                predicted_answers, batch_size=self._batch_size, convert_to_tensor=True
            )
            label_embeddings = self._similarity_model.encode(
                ground_truth_answers, batch_size=self._batch_size, convert_to_tensor=True
            )

            # Compute cosine similarities between each prediction/label embedding pair
            similarity_scores = [
                float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
            ]

        sas_score = np_mean(similarity_scores)

        return {"score": sas_score, "individual_scores": similarity_scores}
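
For reference, a minimal usage sketch of the component above (not part of the covered source): it exercises the default Bi-Encoder path and the to_dict()/from_dict() round trip, and assumes `sentence-transformers>=3.0.0` is installed and the default model can be downloaded from the Hugging Face Hub.

```python
from haystack.components.evaluators.sas_evaluator import SASEvaluator

# The default model is a Bi-Encoder, so run() takes the cosine-similarity branch.
evaluator = SASEvaluator()

# Serialization round trip: to_dict() keeps model/batch_size/device/token,
# from_dict() rebuilds the secret (and the device, if one was set).
evaluator = SASEvaluator.from_dict(evaluator.to_dict())

# warm_up() must be called before run(), otherwise a RuntimeError is raised.
evaluator.warm_up()

result = evaluator.run(
    ground_truth_answers=["The Eiffel Tower was completed in 1889."],
    predicted_answers=["The Eiffel Tower, completed in 1889, is in Paris."],
)
print(result["score"])              # mean SAS over all pairs
print(result["individual_scores"])  # one similarity score per pair
```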