13634803133

Committed 03 Mar 2025 03:47PM UTC coverage: 90.124% (+0.1%) from 89.986%

Build # 13634803133

Build Type

Pull #8906

github

Committed by

web-flow

Commit Message

Merge e48e49114 into 1b2053b35

Pull Request Pull Request #8906: refactor!: remove `dataframe` field from `Document` and `ExtractedTableAnswer`; make `pandas` optional

Run Details

9536 of 10581 relevant lines covered (90.12%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.48

haystack/dataclasses/document.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import hashlib
from dataclasses import asdict, dataclass, field, fields
from typing import Any, Dict, List, Optional

from numpy import ndarray

from haystack import logging
from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.sparse_embedding import SparseEmbedding

logger = logging.getLogger(__name__)


class _BackwardCompatible(type):
    """
    Metaclass that handles Document backward compatibility.

    It intercepts the construction call so that legacy (1.x) keyword arguments are validated,
    converted or dropped before the class' own `__init__` runs.
    """

    def __call__(cls, *args, **kwargs):
        """
        Called before Document.__init__, will remap legacy fields to new ones.

        Only keyword arguments are inspected; positional arguments are forwarded untouched.
        Dataframe is not supported anymore.

        :raises ValueError:
            If `content` is passed as a keyword and is neither a string nor None, or if the removed
            `dataframe` keyword is passed.
        """
        ### Conversion from 1.x Document ###
        content = kwargs.get("content")
        # Compare against None explicitly instead of relying on truthiness: falsy non-string values
        # (0, [], b"", ...) must be rejected too, and objects with an ambiguous truth value
        # (e.g. DataFrames, arrays) must produce this error rather than an unrelated one.
        if content is not None and not isinstance(content, str):
            raise ValueError("The `content` field must be a string or None.")

        # Legacy keyword arguments that are still accepted but not used anymore
        for legacy_key in ("content_type", "id_hash_keys"):
            kwargs.pop(legacy_key, None)

        # Embedding were stored as NumPy arrays in 1.x, so we convert it to the new type
        if isinstance(embedding := kwargs.get("embedding"), ndarray):
            kwargs["embedding"] = embedding.tolist()

        ### >=2.11: Dataframe is not supported anymore ###
        if "dataframe" in kwargs:
            raise ValueError("The `dataframe` field is no longer supported.")

        return super().__call__(*args, **kwargs)


@dataclass
class Document(metaclass=_BackwardCompatible):
    """
    Base data class containing some data to be queried.

    Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
    to/from dictionary and JSON.

    :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
    :param content: Text of the document, if the document contains text.
    :param blob: Binary data associated with the document, if the document has any binary data associated with it.
    :param meta: Additional custom metadata for the document. Must be JSON-serializable.
    :param score: Score of the document. Used for ranking, usually assigned by retrievers.
    :param embedding: dense vector representation of the document.
    :param sparse_embedding: sparse vector representation of the document.
    """

    id: str = field(default="")
    content: Optional[str] = field(default=None)
    blob: Optional[ByteStream] = field(default=None)
    meta: Dict[str, Any] = field(default_factory=dict)
    score: Optional[float] = field(default=None)
    embedding: Optional[List[float]] = field(default=None)
    sparse_embedding: Optional[SparseEmbedding] = field(default=None)

    def __repr__(self):
        """
        Returns a compact, human-readable representation of the Document.

        Only the fields that are set are listed. `content` is cut after 100 characters, while `blob`
        and the embeddings are summarized by their size instead of being printed in full.
        """
        # Named `parts` so that the `dataclasses.fields` import is not shadowed inside this method
        parts = []
        if self.content is not None:
            parts.append(
                f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
            )
        if self.blob is not None:
            parts.append(f"blob: {len(self.blob.data)} bytes")
        if len(self.meta) > 0:
            parts.append(f"meta: {self.meta}")
        if self.score is not None:
            parts.append(f"score: {self.score}")
        if self.embedding is not None:
            parts.append(f"embedding: vector of size {len(self.embedding)}")
        if self.sparse_embedding is not None:
            parts.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
        fields_str = ", ".join(parts)
        return f"{self.__class__.__name__}(id={self.id}, {fields_str})"

    def __eq__(self, other):
        """
        Compares Documents for equality.

        Two Documents are considered equals if they are of the exact same type and their dictionary
        representation is identical.
        """
        if type(self) is not type(other):
            return False
        return self.to_dict() == other.to_dict()

    def __post_init__(self):
        """
        Generate the ID based on the init parameters.
        """
        # Generate an id only if not explicitly set
        self.id = self.id or self._create_id()

    def _create_id(self):
        """
        Creates a hash of the given content that acts as the document's ID.

        :returns:
            The hexadecimal SHA-256 digest of the concatenated string representations of the
            Document's fields.
        """
        text = self.content or None
        dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
        blob = self.blob.data if self.blob is not None else None
        mime_type = self.blob.mime_type if self.blob is not None else None
        meta = self.meta or {}
        embedding = self.embedding
        # A missing sparse embedding contributes an empty string (not "None") to the hashed data
        sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
        data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"
        return hashlib.sha256(data.encode("utf-8")).hexdigest()

    def to_dict(self, flatten: bool = True) -> Dict[str, Any]:
        """
        Converts Document into a dictionary.

        `blob` field is converted to JSON-serializable types.

        :param flatten:
            Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
        :returns:
            The dictionary representation of the Document. When `flatten` is `True` the `meta` entries are
            merged into the top level; on a key clash the `meta` value overrides the Document field.
        """
        data = asdict(self)
        if (blob := data.get("blob")) is not None:
            # Only `data` and `mime_type` of the blob are kept in the serialized form
            data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}

        if flatten:
            meta = data.pop("meta")
            return {**data, **meta}

        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Document":
        """
        Creates a new Document object from a dictionary.

        The `blob` field is converted to its original type. The passed dictionary is not modified.

        :param data:
            The dictionary to build the Document from. Metadata can be passed either nested under the
            `meta` key or flattened as top-level keys, but not both.
        :returns:
            The new Document.
        :raises ValueError:
            If both a non-empty `meta` entry and flattened metadata keys are present.
        """
        # Work on a shallow copy so that the caller's dictionary is left untouched
        data = dict(data)

        if blob := data.get("blob"):
            data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
        if sparse_embedding := data.get("sparse_embedding"):
            data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

        # Store metadata for a moment while we try un-flattening allegedly flatten metadata.
        # We don't expect both a `meta=` keyword and flatten metadata keys so we'll raise a
        # ValueError later if this is the case.
        meta = data.pop("meta", {})

        # Unflatten metadata if it was flattened. We assume any keyword argument that's not
        # a document field is a metadata key. We treat legacy fields as document fields
        # for backward compatibility.
        document_fields = {"content_type", "id_hash_keys", *(f.name for f in fields(cls))}
        init_kwargs = {key: value for key, value in data.items() if key in document_fields}
        flatten_meta = {key: value for key, value in data.items() if key not in document_fields}

        # We don't support passing both flatten keys and the `meta` keyword parameter
        if meta and flatten_meta:
            raise ValueError(
                "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
                "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
            )

        # Finally put back all the metadata
        return cls(**init_kwargs, meta={**meta, **flatten_meta})

    @property
    def content_type(self):
        """
        Returns the type of the content for the document.

        This is necessary to keep backward compatibility with 1.x.

        :raises ValueError:
            If `content` is not set.
        """
        if self.content is not None:
            return "text"
        raise ValueError("Content is not set.")

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import hashlib	1✔
6	from dataclasses import asdict, dataclass, field, fields	1✔
7	from typing import Any, Dict, List, Optional	1✔
8
9	from numpy import ndarray	1✔
10
11	from haystack import logging	1✔
12	from haystack.dataclasses.byte_stream import ByteStream	1✔
13	from haystack.dataclasses.sparse_embedding import SparseEmbedding	1✔
14
15	logger = logging.getLogger(__name__)	1✔
16
17
18	class _BackwardCompatible(type):	1✔
19	"""
20	Metaclass that handles Document backward compatibility.
21	"""
22
23	def __call__(cls, *args, **kwargs):	1✔
24	"""
25	Called before Document.__init__, will remap legacy fields to new ones.
26
27	Also handles building a Document from a flattened dictionary.
28	Dataframe is not supported anymore.
29	"""
30	### Conversion from 1.x Document ###
31	content = kwargs.get("content")	1✔
32	if content and not isinstance(content, str):	1✔
33	raise ValueError("The `content` field must be a string or None.")	×
34
35	# Not used anymore
36	if "content_type" in kwargs:	1✔
37	del kwargs["content_type"]	1✔
38
39	# Embedding were stored as NumPy arrays in 1.x, so we convert it to the new type
40	if isinstance(embedding := kwargs.get("embedding"), ndarray):	1✔
41	kwargs["embedding"] = embedding.tolist()	×
42
43	# id_hash_keys is not used anymore
44	if "id_hash_keys" in kwargs:	1✔
45	del kwargs["id_hash_keys"]	1✔
46
47	### >=2.11: Dataframe is not supported anymore ###
48	if "dataframe" in kwargs:	1✔
49	raise ValueError("The `dataframe` field is no longer supported.")	1✔
50
51	return super().__call__(*args, **kwargs)	1✔
52
53
54	@dataclass	1✔
55	class Document(metaclass=_BackwardCompatible):	1✔
56	"""
57	Base data class containing some data to be queried.
58
59	Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
60	to/from dictionary and JSON.
61
62	:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
63	:param content: Text of the document, if the document contains text.
64	:param blob: Binary data associated with the document, if the document has any binary data associated with it.
65	:param meta: Additional custom metadata for the document. Must be JSON-serializable.
66	:param score: Score of the document. Used for ranking, usually assigned by retrievers.
67	:param embedding: dense vector representation of the document.
68	:param sparse_embedding: sparse vector representation of the document.
69	"""
70
71	id: str = field(default="")	1✔
72	content: Optional[str] = field(default=None)	1✔
73	blob: Optional[ByteStream] = field(default=None)	1✔
74	meta: Dict[str, Any] = field(default_factory=dict)	1✔
75	score: Optional[float] = field(default=None)	1✔
76	embedding: Optional[List[float]] = field(default=None)	1✔
77	sparse_embedding: Optional[SparseEmbedding] = field(default=None)	1✔
78
79	def __repr__(self):	1✔
80	fields = []	1✔
81	if self.content is not None:	1✔
82	fields.append(	1✔
83	f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
84	)
85	if self.blob is not None:	1✔
86	fields.append(f"blob: {len(self.blob.data)} bytes")	1✔
87	if len(self.meta) > 0:	1✔
88	fields.append(f"meta: {self.meta}")	×
89	if self.score is not None:	1✔
90	fields.append(f"score: {self.score}")	×
91	if self.embedding is not None:	1✔
92	fields.append(f"embedding: vector of size {len(self.embedding)}")	×
93	if self.sparse_embedding is not None:	1✔
94	fields.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")	×
95	fields_str = ", ".join(fields)	1✔
96	return f"{self.__class__.__name__}(id={self.id}, {fields_str})"	1✔
97
98	def __eq__(self, other):	1✔
99	"""
100	Compares Documents for equality.
101
102	Two Documents are considered equals if their dictionary representation is identical.
103	"""
104	if type(self) != type(other):	1✔
105	return False	1✔
106	return self.to_dict() == other.to_dict()	1✔
107
108	def __post_init__(self):	1✔
109	"""
110	Generate the ID based on the init parameters.
111	"""
112	# Generate an id only if not explicitly set
113	self.id = self.id or self._create_id()	1✔
114
115	def _create_id(self):	1✔
116	"""
117	Creates a hash of the given content that acts as the document's ID.
118	"""
119	text = self.content or None	1✔
120	dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed	1✔
121	blob = self.blob.data if self.blob is not None else None	1✔
122	mime_type = self.blob.mime_type if self.blob is not None else None	1✔
123	meta = self.meta or {}	1✔
124	embedding = self.embedding if self.embedding is not None else None	1✔
125	sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""	1✔
126	data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"	1✔
127	return hashlib.sha256(data.encode("utf-8")).hexdigest()	1✔
128
129	def to_dict(self, flatten=True) -> Dict[str, Any]:	1✔
130	"""
131	Converts Document into a dictionary.
132
133	`blob` field is converted to JSON-serializable types.
134
135	:param flatten:
136	Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
137	"""
138	data = asdict(self)	1✔
139	if (blob := data.get("blob")) is not None:	1✔
140	data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}	1✔
141
142	if flatten:	1✔
143	meta = data.pop("meta")	1✔
144	return {**data, **meta}	1✔
145
146	return data	1✔
147
148	@classmethod	1✔
149	def from_dict(cls, data: Dict[str, Any]) -> "Document":	1✔
150	"""
151	Creates a new Document object from a dictionary.
152
153	The `blob` field is converted to its original type.
154	"""
155	if blob := data.get("blob"):	1✔
156	data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])	1✔
157	if sparse_embedding := data.get("sparse_embedding"):	1✔
158	data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)	1✔
159
160	# Store metadata for a moment while we try un-flattening allegedly flatten metadata.
161	# We don't expect both a `meta=` keyword and flatten metadata keys so we'll raise a
162	# ValueError later if this is the case.
163	meta = data.pop("meta", {})	1✔
164	# Unflatten metadata if it was flattened. We assume any keyword argument that's not
165	# a document field is a metadata key. We treat legacy fields as document fields
166	# for backward compatibility.
167	flatten_meta = {}	1✔
168	legacy_fields = ["content_type", "id_hash_keys"]	1✔
169	document_fields = legacy_fields + [f.name for f in fields(cls)]	1✔
170	for key in list(data.keys()):	1✔
171	if key not in document_fields:	1✔
172	flatten_meta[key] = data.pop(key)	1✔
173
174	# We don't support passing both flatten keys and the `meta` keyword parameter
175	if meta and flatten_meta:	1✔
176	raise ValueError(	1✔
177	"You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
178	"but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
179	)
180
181	# Finally put back all the metadata
182	return cls(**data, meta={**meta, **flatten_meta})	1✔
183
184	@property	1✔
185	def content_type(self):	1✔
186	"""
187	Returns the type of the content for the document.
188
189	This is necessary to keep backward compatibility with 1.x.
190	"""
191	if self.content is not None:	1✔
192	return "text"	1✔
193	raise ValueError("Content is not set.")	1✔

deepset-ai / haystack / 13634803133

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous