
deepset-ai / haystack, build 14404479415 (Coveralls coverage report)

11 Apr 2025 01:28PM UTC · coverage: 90.316% (-0.01%) from 90.329%
Pull Request #9219: chore: LLM Evaluators - remove deprecated parameters
Merge a68bbd1de into 8bf41a851 (committed via GitHub by web-flow)

10660 of 11803 relevant lines covered (90.32%), 0.9 hits per line

Source File: haystack/components/evaluators/llm_evaluator.py (94.78% covered)
Uncovered lines in this file: the raise-on-failure error path in run(), the invalid-JSON fallback branch and the metadata append in run(), and the warning path in is_valid_json_and_has_expected_keys().
````python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from typing import Any, Dict, List, Optional, Tuple, Type

from tqdm import tqdm

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.builders import PromptBuilder
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.components.generators.chat.types import ChatGenerator
from haystack.core.serialization import component_to_dict
from haystack.dataclasses.chat_message import ChatMessage
from haystack.utils import deserialize_chatgenerator_inplace, deserialize_type, serialize_type

logger = logging.getLogger(__name__)


@component
class LLMEvaluator:
    """
    Uses an LLM to evaluate inputs based on a prompt containing instructions and examples.

    The default API requires an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY".
    The inputs are lists that are user-defined depending on the desired metric.
    The output is a dictionary with a key `results` containing a list of evaluation results.
    Each result is a dictionary with user-defined keys and values of either 0 for FALSE or 1 for TRUE respectively.

    Usage example:
    ```python
    from typing import List
    from haystack.components.evaluators import LLMEvaluator
    evaluator = LLMEvaluator(
        instructions="Is this answer problematic for children?",
        inputs=[("predicted_answers", List[str])],
        outputs=["score"],
        examples=[
            {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
        ],
    )
    predicted_answers = [
        "Football is the most popular sport with around 4 billion followers worldwide",
        "Python language was created by Guido van Rossum.",
    ]
    results = evaluator.run(predicted_answers=predicted_answers)
    print(results)
    # {'results': [{'score': 0}, {'score': 0}]}
    ```
    """

    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        instructions: str,
        inputs: List[Tuple[str, Type[List]]],
        outputs: List[str],
        examples: List[Dict[str, Any]],
        progress_bar: bool = True,
        *,
        raise_on_failure: bool = True,
        chat_generator: Optional[ChatGenerator] = None,
    ):
        """
        Creates an instance of LLMEvaluator.

        If no LLM is specified using the `chat_generator` parameter, the component will use OpenAI in JSON mode.

        :param instructions:
            The prompt instructions to use for evaluation.
            Should be a question about the inputs that can be answered with yes or no.
        :param inputs:
            The inputs that the component expects as incoming connections and that it evaluates.
            Each input is a tuple of an input name and input type. Input types must be lists.
        :param outputs:
            Output names of the evaluation results. They correspond to keys in the output dictionary.
        :param examples:
            Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
            `outputs` parameters.
            Each example is a dictionary with keys "inputs" and "outputs"
            They contain the input and output as dictionaries respectively.
        :param raise_on_failure:
            If True, the component will raise an exception on an unsuccessful API call.
        :param progress_bar:
            Whether to show a progress bar during the evaluation.
        :param chat_generator:
            a ChatGenerator instance which represents the LLM.
            In order for the component to work, the LLM should be configured to return a JSON object. For example,
            when using the OpenAIChatGenerator, you should pass `{"response_format": {"type": "json_object"}}` in the
            `generation_kwargs`.
        """
        self.validate_init_parameters(inputs, outputs, examples)
        component.set_input_types(self, **dict(inputs))

        self.raise_on_failure = raise_on_failure
        self.instructions = instructions
        self.inputs = inputs
        self.outputs = outputs
        self.examples = examples
        self.progress_bar = progress_bar

        template = self.prepare_template()
        self.builder = PromptBuilder(template=template)

        if chat_generator is not None:
            self._chat_generator = chat_generator
        else:
            generation_kwargs = {"response_format": {"type": "json_object"}, "seed": 42}
            self._chat_generator = OpenAIChatGenerator(generation_kwargs=generation_kwargs)

    @staticmethod
    def validate_init_parameters(
        inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]]
    ):
        """
        Validate the init parameters.

        :param inputs:
            The inputs to validate.
        :param outputs:
            The outputs to validate.
        :param examples:
            The examples to validate.

        :raises ValueError:
            If the inputs are not a list of tuples with a string and a type of list.
            If the outputs are not a list of strings.
            If the examples are not a list of dictionaries.
            If any example does not have keys "inputs" and "outputs" with values that are dictionaries with string keys.
        """
        # Validate inputs
        if (
            not isinstance(inputs, list)
            or not all(isinstance(_input, tuple) for _input in inputs)
            or not all(isinstance(_input[0], str) and _input[1] is not list and len(_input) == 2 for _input in inputs)
        ):
            msg = (
                f"LLM evaluator expects inputs to be a list of tuples. Each tuple must contain an input name and "
                f"type of list but received {inputs}."
            )
            raise ValueError(msg)

        # Validate outputs
        if not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs):
            msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}."
            raise ValueError(msg)

        # Validate examples are lists of dicts
        if not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples):
            msg = f"LLM evaluator expects examples to be a list of dictionaries but received {examples}."
            raise ValueError(msg)

        # Validate each example
        for example in examples:
            if (
                {"inputs", "outputs"} != example.keys()
                or not all(isinstance(example[param], dict) for param in ["inputs", "outputs"])
                or not all(isinstance(key, str) for param in ["inputs", "outputs"] for key in example[param])
            ):
                msg = (
                    f"LLM evaluator expects each example to have keys `inputs` and `outputs` with values that are "
                    f"dictionaries with str keys but received {example}."
                )
                raise ValueError(msg)

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run the LLM evaluator.

        :param inputs:
            The input values to evaluate. The keys are the input names and the values are lists of input values.
        :returns:
            A dictionary with a `results` entry that contains a list of results.
            Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
            and the evaluation results as the values. If an exception occurs for a particular input value, the result
            will be `None` for that entry.
            If the API is "openai" and the response contains a "meta" key, the metadata from OpenAI will be included
            in the output dictionary, under the key "meta".
        :raises ValueError:
            Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have
            different lengths, or if the output is not a valid JSON or doesn't contain the expected keys.
        """
        self.validate_input_parameters(dict(self.inputs), inputs)

        # inputs is a dictionary with keys being input names and values being a list of input values
        # We need to iterate through the lists in parallel for all keys of the dictionary
        input_names, values = inputs.keys(), list(zip(*inputs.values()))
        list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]

        results: List[Optional[Dict[str, Any]]] = []
        metadata = []
        errors = 0
        for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
            prompt = self.builder.run(**input_names_to_values)
            messages = [ChatMessage.from_user(prompt["prompt"])]
            try:
                result = self._chat_generator.run(messages=messages)
            except Exception as e:
                if self.raise_on_failure:
                    raise ValueError(f"Error while generating response for prompt: {prompt}. Error: {e}")
                logger.warning("Error while generating response for prompt: {prompt}. Error: {e}", prompt=prompt, e=e)
                results.append(None)
                errors += 1
                continue

            if self.is_valid_json_and_has_expected_keys(expected=self.outputs, received=result["replies"][0].text):
                parsed_result = json.loads(result["replies"][0].text)
                results.append(parsed_result)
            else:
                results.append(None)
                errors += 1

            if result["replies"][0].meta:
                metadata.append(result["replies"][0].meta)

        if errors > 0:
            logger.warning(
                "LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs.",
                errors=errors,
                len=len(list_of_input_names_to_values),
            )

        return {"results": results, "meta": metadata or None}

    def prepare_template(self) -> str:
        """
        Prepare the prompt template.

        Combine instructions, inputs, outputs, and examples into one prompt template with the following format:
        Instructions:
        <instructions>

        Generate the response in JSON format with the following keys:
        <list of output keys>
        Consider the instructions and the examples below to determine those values.

        Examples:
        <examples>

        Inputs:
        <inputs>
        Outputs:

        :returns:
            The prompt template.
        """
        inputs_section = (
            "{" + ", ".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
        )

        examples_section = "\n".join(
            [
                "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"])
                for example in self.examples
            ]
        )
        return (
            f"Instructions:\n"
            f"{self.instructions}\n\n"
            f"Generate the response in JSON format with the following keys:\n"
            f"{json.dumps(self.outputs)}\n"
            f"Consider the instructions and the examples below to determine those values.\n\n"
            f"Examples:\n"
            f"{examples_section}\n\n"
            f"Inputs:\n"
            f"{inputs_section}\n"
            f"Outputs:\n"
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            The serialized component as a dictionary.
        """
        # Since we cannot currently serialize tuples, convert the inputs to a list.
        inputs = [[name, serialize_type(type_)] for name, type_ in self.inputs]
        return default_to_dict(
            self,
            instructions=self.instructions,
            inputs=inputs,
            outputs=self.outputs,
            examples=self.examples,
            chat_generator=component_to_dict(obj=self._chat_generator, name="chat_generator"),
            progress_bar=self.progress_bar,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        data["init_parameters"]["inputs"] = [
            (name, deserialize_type(type_)) for name, type_ in data["init_parameters"]["inputs"]
        ]

        if data["init_parameters"].get("chat_generator"):
            deserialize_chatgenerator_inplace(data["init_parameters"], key="chat_generator")

        return default_from_dict(cls, data)

    @staticmethod
    def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None:
        """
        Validate the input parameters.

        :param expected:
            The expected input parameters.
        :param received:
            The received input parameters.

        :raises ValueError:
            If not all expected inputs are present in the received inputs
            If the received inputs are not lists or have different lengths
        """
        # Validate that all expected inputs are present in the received inputs
        for param in expected.keys():
            if param not in received:
                msg = f"LLM evaluator expected input parameter '{param}' but received only {received.keys()}."
                raise ValueError(msg)

        # Validate that all received inputs are lists
        if not all(isinstance(_input, list) for _input in received.values()):
            msg = (
                "LLM evaluator expects all input values to be lists but received "
                f"{[type(_input) for _input in received.values()]}."
            )
            raise ValueError(msg)

        # Validate that all received inputs are of the same length
        inputs = received.values()
        length = len(next(iter(inputs)))
        if not all(len(_input) == length for _input in inputs):
            msg = (
                f"LLM evaluator expects all input lists to have the same length but received {inputs} with lengths "
                f"{[len(_input) for _input in inputs]}."
            )
            raise ValueError(msg)

    def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str) -> bool:
        """
        Output must be a valid JSON with the expected keys.

        :param expected:
            Names of expected outputs
        :param received:
            Names of received outputs

        :raises ValueError:
            If the output is not a valid JSON with the expected keys:
            - with `raise_on_failure` set to True a ValueError is raised.
            - with `raise_on_failure` set to False a warning is issued and False is returned.

        :returns:
            True if the received output is a valid JSON with the expected keys, False otherwise.
        """
        try:
            parsed_output = json.loads(received)
        except json.JSONDecodeError:
            msg = "Response from LLM evaluator is not a valid JSON."
            if self.raise_on_failure:
                raise ValueError(msg)
            logger.warning(msg)
            return False

        if not all(output in parsed_output for output in expected):
            if self.raise_on_failure:
                raise ValueError(
                    f"Expected response from LLM evaluator to be JSON with keys {expected}, got {{received}}."
                )
            logger.warning(
                "Expected response from LLM evaluator to be JSON with keys {expected}, got {received}.",
                expected=expected,
                received=received,
            )
            return False

        return True
````
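Since `prepare_template()` builds the prompt deterministically from the init parameters, the template for the usage example in the class docstring can be written out directly. A sketch of the resulting Jinja template, with `{{ predicted_answers }}` left as the placeholder that `PromptBuilder` fills for each row of inputs:

```python
# Sketch: the template prepare_template() produces for the class docstring's example
# (instructions="Is this answer problematic for children?",
#  inputs=[("predicted_answers", List[str])], outputs=["score"]).
expected_template = """Instructions:
Is this answer problematic for children?

Generate the response in JSON format with the following keys:
["score"]
Consider the instructions and the examples below to determine those values.

Examples:
Inputs:
{"predicted_answers": "Damn, this is straight outta hell!!!"}
Outputs:
{"score": 1}
Inputs:
{"predicted_answers": "Football is the most popular sport."}
Outputs:
{"score": 0}

Inputs:
{"predicted_answers": {{ predicted_answers }}}
Outputs:
"""
```

The rendered prompt is sent as a single user `ChatMessage`, and the generator's JSON-mode reply is parsed back into the keys listed in `outputs`.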
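The `chat_generator` docstring above notes that any generator passed in must be configured to return a JSON object. A minimal sketch of wiring in a custom `OpenAIChatGenerator`; the model name, instructions, and example data here are illustrative, not taken from this report, and `OPENAI_API_KEY` is assumed to be set in the environment:

```python
from typing import List

from haystack.components.evaluators import LLMEvaluator
from haystack.components.generators.chat.openai import OpenAIChatGenerator

# Illustrative model choice; the generation_kwargs follow the docstring's advice
# to force JSON-mode output so the evaluator can parse the reply.
chat_generator = OpenAIChatGenerator(
    model="gpt-4o-mini",
    generation_kwargs={"response_format": {"type": "json_object"}},
)

evaluator = LLMEvaluator(
    instructions="Does the predicted answer use only information from the context?",
    inputs=[("contexts", List[str]), ("predicted_answers", List[str])],
    outputs=["score"],
    examples=[
        {
            "inputs": {"contexts": "Paris is the capital of France.", "predicted_answers": "Paris"},
            "outputs": {"score": 1},
        },
        {
            "inputs": {"contexts": "Paris is the capital of France.", "predicted_answers": "Berlin"},
            "outputs": {"score": 0},
        },
    ],
    chat_generator=chat_generator,
)

results = evaluator.run(
    contexts=["Paris is the capital of France.", "Paris is the capital of France."],
    predicted_answers=["Paris", "Berlin"],
)
```

Because `component.set_input_types` registers one socket per entry in `inputs`, `run` takes one keyword list per declared input, and `validate_input_parameters` rejects calls where the lists differ in length.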
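Finally, a minimal sketch of the serialization round trip provided by `to_dict` and `from_dict`, continuing from the evaluator constructed above:

```python
# to_dict() converts each (name, type) input tuple into a [name, serialized_type] list
# and serializes the wrapped chat generator via component_to_dict.
data = evaluator.to_dict()

# from_dict() restores the input types with deserialize_type and rebuilds the
# chat generator in place before delegating to default_from_dict.
restored = LLMEvaluator.from_dict(data)
```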
