• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 5646559600

pending completion
5646559600

push

github

web-flow
test: reactivate unit tests in `test_eval.py` (#5255)

* Activate tests that follow unit test and integration test rules

* Adding more integration labels

* Change name to better reflect complexity of test

* Remove mark integration tags, move test to doc store test for add_eval_data

* Removing incorrect integration label

* Deactivated document store test b/c it fails for Weaviate and pinecone

* Remove unit label since test needs to be refactored to be considered a unit test

* Undo changes

* Undo change

* Check every field in the load evaluation result

* Add back label and add skip reason

* Use pytest skip instead of TODO

10748 of 23272 relevant lines covered (46.18%)

2.57 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

33.99
haystack/testing/document_store.py
1
# pylint: disable=too-many-public-methods
2
import sys
2✔
3

4
import pytest
2✔
5
import numpy as np
2✔
6

7
from haystack.schema import Document, Label, Answer, Span
2✔
8
from haystack.errors import DuplicateDocumentError
2✔
9
from haystack.document_stores import BaseDocumentStore
2✔
10

11

12
@pytest.mark.document_store
class DocumentStoreBaseTestAbstract:
    """
    This is a base class to test abstract methods from DocumentStoreBase to be inherited by any Document Store
    testsuite. It doesn't have the `Test` prefix in the name so that its methods won't be collected for this
    class but only for its subclasses.
    """

    @pytest.fixture
    def documents(self):
        """
        Return 9 documents: for each of 3 names, one "Foo" doc (year 2020) and one
        "Bar" doc (year 2021) with random 768-dim embeddings, plus one doc without
        an embedding. Several tests rely on these exact counts and meta values.
        """
        documents = []
        for i in range(3):
            documents.append(
                Document(
                    content=f"A Foo Document {i}",
                    meta={"name": f"name_{i}", "year": "2020", "month": "01", "numbers": [2, 4]},
                    embedding=np.random.rand(768).astype(np.float32),
                )
            )

            documents.append(
                Document(
                    content=f"A Bar Document {i}",
                    meta={"name": f"name_{i}", "year": "2021", "month": "02", "numbers": [-2, -4]},
                    embedding=np.random.rand(768).astype(np.float32),
                )
            )

            documents.append(
                Document(
                    content=f"Document {i} without embeddings",
                    meta={"name": f"name_{i}", "no_embedding": True, "month": "03"},
                )
            )

        return documents

    @pytest.fixture
    def labels(self, documents):
        """Return one Label per document, mixing origins; the first label has no answer."""
        labels = []
        for i, d in enumerate(documents):
            labels.append(
                Label(
                    query=f"query_{i}",
                    document=d,
                    is_correct_document=True,
                    is_correct_answer=False,
                    # create a mix set of labels
                    origin="user-feedback" if i % 2 else "gold-label",
                    answer=None if not i else Answer(f"the answer is {i}", document_ids=[d.id]),
                    meta={"name": f"label_{i}", "year": f"{2020 + i}"},
                )
            )
        return labels

    #
    # Integration tests
    #

    @pytest.mark.integration
    def test_write_documents(self, ds, documents):
        ds.write_documents(documents)
        docs = ds.get_all_documents()
        assert len(docs) == len(documents)
        expected_ids = {doc.id for doc in documents}
        ids = {doc.id for doc in docs}
        assert ids == expected_ids

    @pytest.mark.integration
    def test_write_labels(self, ds, labels):
        ds.write_labels(labels)
        assert ds.get_all_labels() == labels

    @pytest.mark.integration
    def test_write_with_duplicate_doc_ids(self, ds):
        """Writing docs with identical id_hash_keys must dedupe on "skip" and raise on "fail"."""
        duplicate_documents = [
            Document(content="Doc1", id_hash_keys=["content"], meta={"key1": "value1"}),
            Document(content="Doc1", id_hash_keys=["content"], meta={"key1": "value1"}),
        ]
        ds.write_documents(duplicate_documents, duplicate_documents="skip")
        results = ds.get_all_documents()
        assert len(results) == 1
        assert results[0] == duplicate_documents[0]
        # the concrete exception type varies across document stores, hence the broad check
        with pytest.raises(Exception):
            ds.write_documents(duplicate_documents, duplicate_documents="fail")

    @pytest.mark.integration
    def test_get_embedding_count(self, ds, documents):
        """
        We expect 6 docs with embeddings because only 6 documents in the documents fixture for this class contain
        embeddings.
        """
        ds.write_documents(documents)
        assert ds.get_embedding_count() == 6

    @pytest.mark.skip
    @pytest.mark.integration
    def test_get_all_documents_without_filters(self, ds, documents):
        ds.write_documents(documents)
        out = ds.get_all_documents()
        assert out == documents

    @pytest.mark.integration
    def test_get_all_documents_without_embeddings(self, ds, documents):
        ds.write_documents(documents)
        out = ds.get_all_documents(return_embedding=False)
        for doc in out:
            assert doc.embedding is None

    @pytest.mark.integration
    def test_get_all_document_filter_duplicate_text_value(self, ds):
        """Docs with identical content but different meta (hashed on meta) must be filterable individually."""
        documents = [
            Document(content="duplicated", meta={"meta_field": "0"}, id_hash_keys=["meta"]),
            Document(content="duplicated", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]),
            Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]),
        ]
        ds.write_documents(documents)
        documents = ds.get_all_documents(filters={"meta_field": ["1"]})
        assert len(documents) == 1
        assert documents[0].content == "duplicated"
        assert documents[0].meta["name"] == "file.txt"

        documents = ds.get_all_documents(filters={"meta_field": ["0"]})
        assert len(documents) == 1
        assert documents[0].content == "duplicated"
        assert documents[0].meta.get("name") is None

        documents = ds.get_all_documents(filters={"name": ["file_2.txt"]})
        assert len(documents) == 1
        assert documents[0].content == "Doc2"
        assert documents[0].meta.get("meta_field") is None

    @pytest.mark.integration
    def test_get_all_documents_with_correct_filters(self, ds, documents):
        ds.write_documents(documents)
        result = ds.get_all_documents(filters={"year": ["2020"]})
        assert len(result) == 3

        documents = ds.get_all_documents(filters={"year": ["2020", "2021"]})
        assert len(documents) == 6

    @pytest.mark.integration
    def test_get_all_documents_with_incorrect_filter_name(self, ds, documents):
        ds.write_documents(documents)
        result = ds.get_all_documents(filters={"non_existing_meta_field": ["whatever"]})
        assert len(result) == 0

    @pytest.mark.integration
    def test_get_all_documents_with_incorrect_filter_value(self, ds, documents):
        ds.write_documents(documents)
        result = ds.get_all_documents(filters={"year": ["nope"]})
        assert len(result) == 0

    @pytest.mark.integration
    def test_eq_filters(self, ds, documents):
        """Both the explicit `$eq` operator and its shorthand (plain value) must match."""
        ds.write_documents(documents)

        result = ds.get_all_documents(filters={"year": {"$eq": "2020"}})
        assert len(result) == 3
        result = ds.get_all_documents(filters={"year": "2020"})
        assert len(result) == 3

    @pytest.mark.integration
    def test_in_filters(self, ds, documents):
        """Both the explicit `$in` operator and its shorthand (a list) must match."""
        ds.write_documents(documents)

        result = ds.get_all_documents(filters={"year": {"$in": ["2020", "2021", "n.a."]}})
        assert len(result) == 6
        result = ds.get_all_documents(filters={"year": ["2020", "2021", "n.a."]})
        assert len(result) == 6

    @pytest.mark.integration
    def test_ne_filters(self, ds, documents):
        ds.write_documents(documents)

        result = ds.get_all_documents(filters={"year": {"$ne": "2020"}})
        assert len(result) == 6

    @pytest.mark.integration
    def test_nin_filters(self, ds, documents):
        ds.write_documents(documents)

        result = ds.get_all_documents(filters={"year": {"$nin": ["2020", "2021", "n.a."]}})
        assert len(result) == 3

    @pytest.mark.integration
    def test_comparison_filters(self, ds, documents):
        """`$gt`/`$gte`/`$lt`/`$lte` on the list-valued "numbers" meta field."""
        ds.write_documents(documents)

        result = ds.get_all_documents(filters={"numbers": {"$gt": 0.0}})
        assert len(result) == 3

        result = ds.get_all_documents(filters={"numbers": {"$gte": -2.0}})
        assert len(result) == 6

        result = ds.get_all_documents(filters={"numbers": {"$lt": 0.0}})
        assert len(result) == 3

        result = ds.get_all_documents(filters={"numbers": {"$lte": 2.0}})
        assert len(result) == 6

    @pytest.mark.integration
    def test_compound_filters(self, ds, documents):
        ds.write_documents(documents)

        result = ds.get_all_documents(filters={"year": {"$lte": "2021", "$gte": "2020"}})
        assert len(result) == 6

    @pytest.mark.integration
    def test_simplified_filters(self, ds, documents):
        """An explicit top-level `$and` and the implicit (simplified) form must be equivalent."""
        ds.write_documents(documents)

        filters = {"$and": {"year": {"$lte": "2021", "$gte": "2020"}, "name": {"$in": ["name_0", "name_1"]}}}
        result = ds.get_all_documents(filters=filters)
        assert len(result) == 4

        filters_simplified = {"year": {"$lte": "2021", "$gte": "2020"}, "name": ["name_0", "name_1"]}
        result = ds.get_all_documents(filters=filters_simplified)
        assert len(result) == 4

    @pytest.mark.integration
    def test_nested_condition_filters(self, ds, documents):
        ds.write_documents(documents)
        filters = {
            "$and": {
                "year": {"$lte": "2021", "$gte": "2020"},
                "$or": {"name": {"$in": ["name_0", "name_1"]}, "numbers": {"$lt": 5.0}},
            }
        }
        result = ds.get_all_documents(filters=filters)
        assert len(result) == 6

        filters_simplified = {
            "year": {"$lte": "2021", "$gte": "2020"},
            "$or": {"name": {"$in": ["name_0", "name_2"]}, "numbers": {"$lt": 5.0}},
        }
        result = ds.get_all_documents(filters=filters_simplified)
        assert len(result) == 6

        filters = {
            "$and": {
                "year": {"$lte": "2021", "$gte": "2020"},
                "$or": {
                    "name": {"$in": ["name_0", "name_1"]},
                    "$and": {"numbers": {"$lt": 5.0}, "$not": {"month": {"$eq": "01"}}},
                },
            }
        }
        result = ds.get_all_documents(filters=filters)
        assert len(result) == 5

        filters_simplified = {
            "year": {"$lte": "2021", "$gte": "2020"},
            "$or": {"name": ["name_0", "name_1"], "$and": {"numbers": {"$lt": 5.0}, "$not": {"month": {"$eq": "01"}}}},
        }
        result = ds.get_all_documents(filters=filters_simplified)
        assert len(result) == 5

    @pytest.mark.integration
    def test_nested_condition_not_filters(self, ds, documents):
        """
        Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore
        """
        ds.write_documents(documents)
        filters = {
            "$not": {
                "$or": {
                    "$and": {"numbers": {"$lt": 5.0}, "month": {"$ne": "01"}},
                    "$not": {"year": {"$lte": "2021", "$gte": "2020"}},
                }
            }
        }
        result = ds.get_all_documents(filters=filters)
        assert len(result) == 3

        docs_meta = result[0].meta["numbers"]
        assert [2, 4] == docs_meta

        # Test same logical operator twice on same level

        filters = {
            "$or": [
                {"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$gte": "2020"}}},
                {"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$lt": "2021"}}},
            ]
        }
        result = ds.get_all_documents(filters=filters)
        docs_meta = [doc.meta["name"] for doc in result]
        assert len(result) == 4
        assert "name_0" in docs_meta
        assert "name_2" not in docs_meta

    @pytest.mark.integration
    def test_get_document_by_id(self, ds, documents):
        ds.write_documents(documents)
        doc = ds.get_document_by_id(documents[0].id)
        assert doc.id == documents[0].id
        assert doc.content == documents[0].content

    @pytest.mark.integration
    def test_get_documents_by_id(self, ds, documents):
        ds.write_documents(documents)
        ids = [doc.id for doc in documents]
        # batch_size=2 forces batched retrieval to be exercised
        result = {doc.id for doc in ds.get_documents_by_id(ids, batch_size=2)}
        assert set(ids) == result

    @pytest.mark.integration
    def test_get_document_count(self, ds, documents):
        ds.write_documents(documents)
        assert ds.get_document_count() == len(documents)
        assert ds.get_document_count(filters={"year": ["2020"]}) == 3
        assert ds.get_document_count(filters={"month": ["02"]}) == 3

    @pytest.mark.integration
    def test_get_all_documents_generator(self, ds, documents):
        ds.write_documents(documents)
        assert len(list(ds.get_all_documents_generator(batch_size=2))) == 9

    @pytest.mark.integration
    def test_duplicate_documents_skip(self, ds, documents):
        """With duplicate_documents="skip", re-writing existing ids must not change stored docs."""
        ds.write_documents(documents)

        updated_docs = []
        for d in documents:
            updated_d = Document.from_dict(d.to_dict())
            updated_d.meta["name"] = "Updated"
            updated_docs.append(updated_d)

        ds.write_documents(updated_docs, duplicate_documents="skip")
        for d in ds.get_all_documents():
            assert d.meta.get("name") != "Updated"

    @pytest.mark.integration
    def test_duplicate_documents_overwrite(self, ds, documents):
        """With duplicate_documents="overwrite", re-writing existing ids must replace stored docs."""
        ds.write_documents(documents)

        updated_docs = []
        for d in documents:
            updated_d = Document.from_dict(d.to_dict())
            updated_d.meta["name"] = "Updated"
            updated_docs.append(updated_d)

        ds.write_documents(updated_docs, duplicate_documents="overwrite")
        for doc in ds.get_all_documents():
            assert doc.meta["name"] == "Updated"

    @pytest.mark.integration
    def test_duplicate_documents_fail(self, ds, documents):
        ds.write_documents(documents)

        updated_docs = []
        for d in documents:
            updated_d = Document.from_dict(d.to_dict())
            updated_d.meta["name"] = "Updated"
            updated_docs.append(updated_d)

        with pytest.raises(DuplicateDocumentError):
            ds.write_documents(updated_docs, duplicate_documents="fail")

    @pytest.mark.integration
    def test_write_document_meta(self, ds):
        """Meta must round-trip whether docs are written as dicts or Document objects."""
        ds.write_documents(
            [
                {"content": "dict_without_meta", "id": "1"},
                {"content": "dict_with_meta", "meta_field": "test2", "id": "2"},
                Document(content="document_object_without_meta", id="3"),
                Document(content="document_object_with_meta", meta={"meta_field": "test4"}, id="4"),
            ]
        )
        assert not ds.get_document_by_id("1").meta
        assert ds.get_document_by_id("2").meta["meta_field"] == "test2"
        assert not ds.get_document_by_id("3").meta
        assert ds.get_document_by_id("4").meta["meta_field"] == "test4"

    @pytest.mark.integration
    def test_delete_documents(self, ds, documents):
        ds.write_documents(documents)
        ds.delete_documents()
        assert ds.get_document_count() == 0

    @pytest.mark.integration
    def test_delete_documents_with_filters(self, ds, documents):
        ds.write_documents(documents)
        # deletes the 6 docs from years 2020/2021, leaving the 3 without a "year" meta field
        ds.delete_documents(filters={"year": ["2020", "2021"]})
        assert ds.get_document_count() == 3

    @pytest.mark.integration
    def test_delete_documents_by_id(self, ds, documents):
        ds.write_documents(documents)
        docs_to_delete = ds.get_all_documents(filters={"year": ["2020"]})
        ds.delete_documents(ids=[doc.id for doc in docs_to_delete])
        assert ds.get_document_count() == 6

    @pytest.mark.integration
    def test_delete_documents_by_id_with_filters(self, ds, documents):
        ds.write_documents(documents)
        docs_to_delete = ds.get_all_documents(filters={"year": ["2020"]})
        # this should delete only 1 document out of the 3 ids passed
        ds.delete_documents(ids=[doc.id for doc in docs_to_delete], filters={"name": ["name_0"]})
        assert ds.get_document_count() == 8

    @pytest.mark.integration
    def test_write_get_all_labels(self, ds, labels):
        ds.write_labels(labels)
        ds.write_labels(labels[:3], index="custom_index")
        assert len(ds.get_all_labels()) == 9
        assert len(ds.get_all_labels(index="custom_index")) == 3
        # remove the index we created in this test
        ds.delete_index("custom_index")

    @pytest.mark.integration
    def test_delete_labels(self, ds, labels):
        ds.write_labels(labels)
        ds.write_labels(labels[:3], index="custom_index")
        ds.delete_labels()
        ds.delete_labels(index="custom_index")
        assert len(ds.get_all_labels()) == 0
        assert len(ds.get_all_labels(index="custom_index")) == 0
        # remove the index we created in this test
        ds.delete_index("custom_index")

    @pytest.mark.integration
    def test_write_labels_duplicate(self, ds, labels):
        # create a duplicate
        dupe = Label.from_dict(labels[0].to_dict())

        ds.write_labels(labels + [dupe])

        # ensure the duplicate was discarded
        assert len(ds.get_all_labels()) == len(labels)

    @pytest.mark.integration
    def test_delete_labels_by_id(self, ds, labels):
        ds.write_labels(labels)
        ds.delete_labels(ids=[labels[0].id])
        assert len(ds.get_all_labels()) == len(labels) - 1

    @pytest.mark.integration
    def test_delete_labels_by_filter(self, ds, labels):
        ds.write_labels(labels)
        ds.delete_labels(filters={"query": "query_1"})
        assert len(ds.get_all_labels()) == len(labels) - 1

    @pytest.mark.integration
    def test_delete_labels_by_filter_id(self, ds, labels):
        ds.write_labels(labels)

        # ids and filters are ANDed, the following should have no effect
        ds.delete_labels(ids=[labels[0].id], filters={"query": "query_9"})
        assert len(ds.get_all_labels()) == len(labels)

        # matching id AND filter deletes exactly that one label
        ds.delete_labels(ids=[labels[0].id], filters={"query": "query_0"})
        assert len(ds.get_all_labels()) == len(labels) - 1

    @pytest.mark.integration
    def test_get_label_count(self, ds, labels):
        ds.write_labels(labels)
        assert ds.get_label_count() == len(labels)

    @pytest.mark.integration
    def test_delete_index(self, ds, documents):
        ds.write_documents(documents, index="custom_index")
        assert ds.get_document_count(index="custom_index") == len(documents)
        ds.delete_index(index="custom_index")
        # querying a deleted index must fail; the exact exception type is store-specific
        with pytest.raises(Exception):
            ds.get_document_count(index="custom_index")

    @pytest.mark.integration
    def test_delete_index_does_not_raise_if_not_exists(self, ds):
        ds.delete_index(index="unknown_index")

    @pytest.mark.integration
    def test_update_meta(self, ds, documents):
        ds.write_documents(documents)
        doc = documents[0]
        ds.update_document_meta(doc.id, meta={"year": "2099", "month": "12"})
        doc = ds.get_document_by_id(doc.id)
        assert doc.meta["year"] == "2099"
        assert doc.meta["month"] == "12"

    @pytest.mark.integration
    def test_labels_with_long_texts(self, ds, documents):
        """Labels whose answer context and document exceed typical field limits must round-trip intact."""
        label = Label(
            query="question1",
            answer=Answer(
                answer="answer",
                type="extractive",
                score=0.0,
                context="something " * 10_000,
                offsets_in_document=[Span(start=12, end=14)],
                offsets_in_context=[Span(start=12, end=14)],
            ),
            is_correct_answer=True,
            is_correct_document=True,
            document=Document(content="something " * 10_000, id="123"),
            origin="gold-label",
        )
        ds.write_labels(labels=[label])
        labels = ds.get_all_labels()
        assert len(labels) == 1
        assert label == labels[0]

    @pytest.mark.integration
    @pytest.mark.skipif(sys.platform == "win32", reason="_get_documents_meta() fails with 'too many SQL variables'")
    def test_get_all_documents_large_quantities(self, ds):
        # Test to exclude situations like Weaviate not returning more than 100 docs by default
        #   https://github.com/deepset-ai/haystack/issues/1893
        docs_to_write = [
            {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
            for i in range(1000)
        ]
        ds.write_documents(docs_to_write)
        documents = ds.get_all_documents()
        assert all(isinstance(d, Document) for d in documents)
        assert len(documents) == len(docs_to_write)

    @pytest.mark.integration
    def test_custom_embedding_field(self, ds):
        ds.embedding_field = "custom_embedding_field"
        doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
        ds.write_documents([doc_to_write])
        documents = ds.get_all_documents(return_embedding=True)
        assert len(documents) == 1
        assert documents[0].content == "test"
        # Some document stores normalize the embedding on save, let's just compare the length
        assert doc_to_write["custom_embedding_field"].shape == documents[0].embedding.shape

    @pytest.mark.skip(reason="This currently fails for Weaviate and Pinecone")
    @pytest.mark.integration
    @pytest.mark.parametrize("batch_size", [None, 20])
    def test_add_eval_data(self, ds, batch_size, samples_path):
        # add eval data (SQUAD format)
        ds.add_eval_data(
            filename=samples_path / "squad" / "small.json",
            doc_index=ds.index,
            label_index=ds.label_index,
            batch_size=batch_size,
        )
        assert ds.get_document_count() == 87
        assert ds.get_label_count() == 1214

    #
    # Unit tests
    #

    @pytest.mark.unit
    def test_normalize_embeddings_diff_shapes(self):
        """normalize_embedding must produce unit-norm vectors in place, for both 1-D and 2-D inputs."""
        VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32")
        BaseDocumentStore.normalize_embedding(VEC_1)
        # abs() is required: `norm - 1 < 0.01` would pass for any norm below 1.01,
        # including a vector that was never normalized at all
        assert abs(np.linalg.norm(VEC_1) - 1) < 0.01

        VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32").reshape(1, -1)
        BaseDocumentStore.normalize_embedding(VEC_1)
        assert abs(np.linalg.norm(VEC_1) - 1) < 0.01
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc