deepset-ai / haystack · build 9568249476
18 Jun 2024 03:52PM UTC · coverage: 89.872% (-0.1%) from 89.995%
Trigger: push · CI service: github · Committer: web-flow

ci: Add code formatting checks  (#7882)

* ruff settings

enable ruff format and re-format outdated files

feat: `EvaluationRunResult` add parameter to specify columns to keep in the comparative `DataFrame` (#7879)

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* adding param to explicitly state which cols to keep

* updating tests

* adding release notes

* Update haystack/evaluation/eval_run_result.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update releasenotes/notes/add-keep-columns-to-EvalRunResult-comparative-be3e15ce45de3e0b.yaml

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* updating docstring

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
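
As an aside on the #7879 change above, a minimal, hedged sketch of how the new column-selection option might be used. The module path comes from the file edited in this commit (`haystack/evaluation/eval_run_result.py`); the parameter name `keep_columns`, the method it is attached to, and the exact constructor signature are assumptions inferred from the commit title and the release-note filename, not confirmed API.

```python
# Sketch only: illustrates the feature described in #7879, not a verbatim test from the PR.
# Assumption: the new parameter is called `keep_columns` and belongs to
# `comparative_individual_scores_report`; the released signature may differ.
from haystack.evaluation.eval_run_result import EvaluationRunResult

inputs = {
    "question": ["Which country is Paris in?"],
    "answer": ["France"],
    "context": ["Paris is the capital of France."],
}

baseline = EvaluationRunResult(
    run_name="baseline",
    inputs=inputs,
    results={"map": {"score": 0.8, "individual_scores": [0.8]}},
)
candidate = EvaluationRunResult(
    run_name="candidate",
    inputs=inputs,
    results={"map": {"score": 0.9, "individual_scores": [0.9]}},
)

# Only the listed input columns are kept in the comparative DataFrame,
# alongside the per-run score columns.
comparative_df = baseline.comparative_individual_scores_report(
    candidate, keep_columns=["question", "answer"]
)
print(comparative_df.columns)
```

The point of the change is simply that the comparative report no longer has to carry every input column, only the ones explicitly requested plus the score columns of each run.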

add format-check

fail on format and linting failures

fix string formatting

reformat long lines

fix tests

fix typing

linter

pull from main

* reformat

* lint -> check

* lint -> check

6957 of 7741 relevant lines covered (89.87%) · 0.9 hits per line

Source file: haystack/components/evaluators/document_map.py · 92.86% of lines covered
Every relevant line in this file was hit once (shown as 1✔ in the report); the only uncovered lines (shown as ×) were the two `continue` branches noted below.

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMAPEvaluator:
    """
    A Mean Average Precision (MAP) evaluator for documents.

    Evaluator that calculates the mean average precision of the retrieved documents, a metric
    that measures how high retrieved documents are ranked.
    Each question can have multiple ground truth documents and multiple retrieved documents.

    `DocumentMAPEvaluator` doesn't normalize its inputs, the `DocumentCleaner` component
    should be used to clean and normalize the documents before passing them to this evaluator.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators import DocumentMAPEvaluator

    evaluator = DocumentMAPEvaluator()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
        ],
    )

    print(result["individual_scores"])
    # [1.0, 0.8333333333333333]
    print(result["score"])
    # 0.9166666666666666
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentMAPEvaluator on the given inputs.

        All lists must have the same length.

        :param ground_truth_documents:
            A list of expected documents for each question.
        :param retrieved_documents:
            A list of retrieved documents for each question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of calculated scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high retrieved documents
                are ranked.
        """
        if len(ground_truth_documents) != len(retrieved_documents):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        individual_scores = []

        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
            score = 0.0
            for ground_document in ground_truth:
                if ground_document.content is None:
                    continue  # not covered by the test suite (×)

                average_precision = 0.0
                relevant_documents = 0

                for rank, retrieved_document in enumerate(retrieved):
                    if retrieved_document.content is None:
                        continue  # not covered by the test suite (×)

                    if ground_document.content in retrieved_document.content:
                        relevant_documents += 1
                        average_precision += relevant_documents / (rank + 1)
                if relevant_documents > 0:
                    score = average_precision / relevant_documents
            individual_scores.append(score)

        score = sum(individual_scores) / len(retrieved_documents)

        return {"score": score, "individual_scores": individual_scores}
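
To spell out the docstring example above: for each query, `score` is overwritten for every ground-truth document that has at least one match, so the reported value is the average precision of the last matching ground-truth document. For the second query that is `"9th"`, which appears as a substring of the retrieved documents at ranks 1 and 3, giving (1/1 + 2/3) / 2 = 5/6 ≈ 0.8333. The first query scores 1.0, and the overall `score` is the mean of the per-query scores. A quick check of the arithmetic:

```python
# Worked arithmetic for the docstring example above (no Haystack needed).
# "9th" matches the retrieved list at ranks 1 and 3 -> precisions 1/1 and 2/3.
ap_second_query = (1 / 1 + 2 / 3) / 2         # 0.8333333333333333
overall_score = (1.0 + ap_second_query) / 2   # 0.9166666666666666
print(ap_second_query, overall_score)
```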