13074273511

Committed 31 Jan 2025 02:03PM UTC coverage: 91.359% (+0.007%) from 91.352%

Build # 13074273511

Build Type

push

github

Committed by

web-flow

Commit Message

deprecate dataframe and ExtractedTableAnswer (#8789)

Run Details

8871 of 9710 relevant lines covered (91.36%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.52

haystack/dataclasses/document.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import hashlib
import io
import warnings
from dataclasses import asdict, dataclass, field, fields
from typing import Any, Dict, List, Optional

from numpy import ndarray
from pandas import DataFrame, read_json

from haystack import logging
from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.sparse_embedding import SparseEmbedding

# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)


class _BackwardCompatible(type):
    """
    Metaclass that handles Document backward compatibility.
    """

    def __call__(cls, *args, **kwargs):
        """
        Called before Document.__init__, will remap legacy fields to new ones.

        Also handles building a Document from a flattened dictionary.
        """
        # Move `content` to new fields depending on the type
        content = kwargs.get("content")
        if isinstance(content, DataFrame):
            kwargs["dataframe"] = content
            del kwargs["content"]

        # Not used anymore
        if "content_type" in kwargs:
            del kwargs["content_type"]

        # Embedding were stored as NumPy arrays in 1.x, so we convert it to the new type
        if isinstance(embedding := kwargs.get("embedding"), ndarray):
            kwargs["embedding"] = embedding.tolist()

        # id_hash_keys is not used anymore
        if "id_hash_keys" in kwargs:
            del kwargs["id_hash_keys"]

        return super().__call__(*args, **kwargs)


@dataclass
class Document(metaclass=_BackwardCompatible):
    """
    Base data class containing some data to be queried.

    Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
    to/from dictionary and JSON.

    :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
    :param content: Text of the document, if the document contains text.
    :param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
        Deprecated: setting it emits a `DeprecationWarning`.
    :param blob: Binary data associated with the document, if the document has any binary data associated with it.
    :param meta: Additional custom metadata for the document. Must be JSON-serializable.
    :param score: Score of the document. Used for ranking, usually assigned by retrievers.
    :param embedding: dense vector representation of the document.
    :param sparse_embedding: sparse vector representation of the document.
    """

    id: str = field(default="")
    content: Optional[str] = field(default=None)
    dataframe: Optional[DataFrame] = field(default=None)
    blob: Optional[ByteStream] = field(default=None)
    meta: Dict[str, Any] = field(default_factory=dict)
    score: Optional[float] = field(default=None)
    embedding: Optional[List[float]] = field(default=None)
    sparse_embedding: Optional[SparseEmbedding] = field(default=None)

    def __repr__(self):
        """
        Returns a compact representation listing only the fields that are set.

        Long content is truncated to its first 100 characters; dataframes, blobs and
        embeddings are summarized by their size instead of being printed in full.
        """
        # Named `parts` so the `fields` helper imported from `dataclasses` is not shadowed.
        parts = []
        if self.content is not None:
            parts.append(
                f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
            )
        if self.dataframe is not None:
            parts.append(f"dataframe: {self.dataframe.shape}")
        if self.blob is not None:
            parts.append(f"blob: {len(self.blob.data)} bytes")
        if len(self.meta) > 0:
            parts.append(f"meta: {self.meta}")
        if self.score is not None:
            parts.append(f"score: {self.score}")
        if self.embedding is not None:
            parts.append(f"embedding: vector of size {len(self.embedding)}")
        if self.sparse_embedding is not None:
            parts.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
        fields_str = ", ".join(parts)
        return f"{self.__class__.__name__}(id={self.id}, {fields_str})"

    def __eq__(self, other):
        """
        Compares Documents for equality.

        Two Documents are considered equal if they have exactly the same type and their
        dictionary representation is identical.
        """
        # Exact type match on purpose: an instance of a subclass never equals a base Document.
        if type(self) is not type(other):
            return False
        return self.to_dict() == other.to_dict()

    def __post_init__(self):
        """
        Generates the ID based on the init parameters and warns about deprecated fields.
        """
        # Generate an id only if not explicitly set
        self.id = self.id or self._create_id()

        if self.dataframe is not None:
            msg = "The `dataframe` field is deprecated and will be removed in Haystack 2.11.0."
            warnings.warn(msg, DeprecationWarning)

    def _create_id(self):
        """
        Creates a hash of the document's fields that acts as the document's ID.

        :returns:
            The hexadecimal SHA-256 digest of the concatenated string form of the content,
            dataframe, blob data, blob mime type, meta, embedding and sparse embedding.
        """
        # NOTE: the exact string form of every component below feeds the hash. Changing any
        # placeholder (e.g. `None` vs `""`) would change the generated IDs.
        text = self.content or None
        dataframe = self.dataframe.to_json() if self.dataframe is not None else None
        blob = self.blob.data if self.blob is not None else None
        mime_type = self.blob.mime_type if self.blob is not None else None
        meta = self.meta or {}
        embedding = self.embedding
        sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
        data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"
        return hashlib.sha256(data.encode("utf-8")).hexdigest()

    def to_dict(self, flatten=True) -> Dict[str, Any]:
        """
        Converts Document into a dictionary.

        `dataframe` and `blob` fields are converted to JSON-serializable types.

        :param flatten:
            Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
        :returns:
            A dictionary with the document's fields. When `flatten` is `True` the `meta` key is
            removed and its entries are merged into the top level of the dictionary; a metadata
            key that has the same name as a document field overrides that field's value.
        """
        data = asdict(self)
        if (dataframe := data.get("dataframe")) is not None:
            data["dataframe"] = dataframe.to_json()
        # `asdict` has already turned the blob into a plain dict, only the bytes need converting.
        if (blob := data.get("blob")) is not None:
            data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}

        if flatten:
            meta = data.pop("meta")
            return {**data, **meta}

        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Document":
        """
        Creates a new Document object from a dictionary.

        The `dataframe`, `blob` and `sparse_embedding` fields are converted to their original types.
        The dictionary passed in is left untouched.

        :param data:
            Dictionary with the document's fields. Metadata can be passed either under the `meta`
            key or flattened, as top-level keys that are not document fields.
        :returns:
            The Document built from `data`.
        :raises ValueError:
            If both a non-empty `meta` and flattened metadata keys are passed.
        """
        # Work on a shallow copy: the conversions below replace and remove keys, and doing that
        # on the caller's dictionary would make a second `from_dict` call on it fail.
        data = dict(data)

        if (dataframe := data.get("dataframe")) is not None:
            data["dataframe"] = read_json(io.StringIO(dataframe))
        if blob := data.get("blob"):
            data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
        if sparse_embedding := data.get("sparse_embedding"):
            data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

        # Set the explicit metadata aside while the flattened metadata is un-flattened.
        # An explicit `meta` of `None` is treated like missing metadata.
        meta = data.pop("meta", None) or {}

        # Any key that's not a document field is assumed to be a flattened metadata key.
        # Legacy fields are treated as document fields for backward compatibility.
        document_fields = {"content_type", "id_hash_keys", *(f.name for f in fields(cls))}
        flatten_meta = {key: value for key, value in data.items() if key not in document_fields}
        init_kwargs = {key: value for key, value in data.items() if key in document_fields}

        # We don't support passing both flatten keys and the `meta` keyword parameter
        if meta and flatten_meta:
            raise ValueError(
                "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
                "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
            )

        # Finally put back all the metadata
        return cls(**init_kwargs, meta={**meta, **flatten_meta})

    @property
    def content_type(self):
        """
        Returns the type of the content for the document.

        This is necessary to keep backward compatibility with 1.x.

        :returns:
            `"text"` if only `content` is set, `"table"` if only `dataframe` is set.
        :raises ValueError:
            If both `content` and `dataframe` fields are set or both are missing.
        """
        if self.content is not None and self.dataframe is not None:
            raise ValueError("Both text and dataframe are set.")

        if self.content is not None:
            return "text"
        elif self.dataframe is not None:
            return "table"
        raise ValueError("Neither text nor dataframe is set.")

1	# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2	#
3	# SPDX-License-Identifier: Apache-2.0
4
5	import hashlib	1✔
6	import io	1✔
7	import warnings	1✔
8	from dataclasses import asdict, dataclass, field, fields	1✔
9	from typing import Any, Dict, List, Optional	1✔
10
11	from numpy import ndarray	1✔
12	from pandas import DataFrame, read_json	1✔
13
14	from haystack import logging	1✔
15	from haystack.dataclasses.byte_stream import ByteStream	1✔
16	from haystack.dataclasses.sparse_embedding import SparseEmbedding	1✔
17
18	logger = logging.getLogger(__name__)	1✔
19
20
21	class _BackwardCompatible(type):	1✔
22	"""
23	Metaclass that handles Document backward compatibility.
24	"""
25
26	def __call__(cls, *args, **kwargs):	1✔
27	"""
28	Called before Document.__init__, will remap legacy fields to new ones.
29
30	Also handles building a Document from a flattened dictionary.
31	"""
32	# Move `content` to new fields depending on the type
33	content = kwargs.get("content")	1✔
34	if isinstance(content, DataFrame):	1✔
35	kwargs["dataframe"] = content	×
36	del kwargs["content"]	×
37
38	# Not used anymore
39	if "content_type" in kwargs:	1✔
40	del kwargs["content_type"]	1✔
41
42	# Embedding were stored as NumPy arrays in 1.x, so we convert it to the new type
43	if isinstance(embedding := kwargs.get("embedding"), ndarray):	1✔
44	kwargs["embedding"] = embedding.tolist()	×
45
46	# id_hash_keys is not used anymore
47	if "id_hash_keys" in kwargs:	1✔
48	del kwargs["id_hash_keys"]	1✔
49
50	return super().__call__(*args, **kwargs)	1✔
51
52
53	@dataclass	1✔
54	class Document(metaclass=_BackwardCompatible):	1✔
55	"""
56	Base data class containing some data to be queried.
57
58	Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
59	to/from dictionary and JSON.
60
61	:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
62	:param content: Text of the document, if the document contains text.
63	:param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
64	:param blob: Binary data associated with the document, if the document has any binary data associated with it.
65	:param meta: Additional custom metadata for the document. Must be JSON-serializable.
66	:param score: Score of the document. Used for ranking, usually assigned by retrievers.
67	:param embedding: dense vector representation of the document.
68	:param sparse_embedding: sparse vector representation of the document.
69	"""
70
71	id: str = field(default="")	1✔
72	content: Optional[str] = field(default=None)	1✔
73	dataframe: Optional[DataFrame] = field(default=None)	1✔
74	blob: Optional[ByteStream] = field(default=None)	1✔
75	meta: Dict[str, Any] = field(default_factory=dict)	1✔
76	score: Optional[float] = field(default=None)	1✔
77	embedding: Optional[List[float]] = field(default=None)	1✔
78	sparse_embedding: Optional[SparseEmbedding] = field(default=None)	1✔
79
80	def __repr__(self):	1✔
81	fields = []	1✔
82	if self.content is not None:	1✔
83	fields.append(	1✔
84	f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
85	)
86	if self.dataframe is not None:	1✔
87	fields.append(f"dataframe: {self.dataframe.shape}")	1✔
88	if self.blob is not None:	1✔
89	fields.append(f"blob: {len(self.blob.data)} bytes")	1✔
90	if len(self.meta) > 0:	1✔
91	fields.append(f"meta: {self.meta}")	×
92	if self.score is not None:	1✔
93	fields.append(f"score: {self.score}")	×
94	if self.embedding is not None:	1✔
95	fields.append(f"embedding: vector of size {len(self.embedding)}")	×
96	if self.sparse_embedding is not None:	1✔
97	fields.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")	×
98	fields_str = ", ".join(fields)	1✔
99	return f"{self.__class__.__name__}(id={self.id}, {fields_str})"	1✔
100
101	def __eq__(self, other):	1✔
102	"""
103	Compares Documents for equality.
104
105	Two Documents are considered equals if their dictionary representation is identical.
106	"""
107	if type(self) != type(other):	1✔
108	return False	1✔
109	return self.to_dict() == other.to_dict()	1✔
110
111	def __post_init__(self):	1✔
112	"""
113	Generate the ID based on the init parameters.
114	"""
115	# Generate an id only if not explicitly set
116	self.id = self.id or self._create_id()	1✔
117
118	if self.dataframe is not None:	1✔
119	msg = "The `dataframe` field is deprecated and will be removed in Haystack 2.11.0."	1✔
120	warnings.warn(msg, DeprecationWarning)	1✔
121
122	def _create_id(self):	1✔
123	"""
124	Creates a hash of the given content that acts as the document's ID.
125	"""
126	text = self.content or None	1✔
127	dataframe = self.dataframe.to_json() if self.dataframe is not None else None	1✔
128	blob = self.blob.data if self.blob is not None else None	1✔
129	mime_type = self.blob.mime_type if self.blob is not None else None	1✔
130	meta = self.meta or {}	1✔
131	embedding = self.embedding if self.embedding is not None else None	1✔
132	sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""	1✔
133	data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"	1✔
134	return hashlib.sha256(data.encode("utf-8")).hexdigest()	1✔
135
136	def to_dict(self, flatten=True) -> Dict[str, Any]:	1✔
137	"""
138	Converts Document into a dictionary.
139
140	`dataframe` and `blob` fields are converted to JSON-serializable types.
141
142	:param flatten:
143	Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
144	"""
145	data = asdict(self)	1✔
146	if (dataframe := data.get("dataframe")) is not None:	1✔
147	data["dataframe"] = dataframe.to_json()	1✔
148	if (blob := data.get("blob")) is not None:	1✔
149	data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}	1✔
150
151	if flatten:	1✔
152	meta = data.pop("meta")	1✔
153	return {**data, **meta}	1✔
154
155	return data	1✔
156
157	@classmethod	1✔
158	def from_dict(cls, data: Dict[str, Any]) -> "Document":	1✔
159	"""
160	Creates a new Document object from a dictionary.
161
162	The `dataframe` and `blob` fields are converted to their original types.
163	"""
164	if (dataframe := data.get("dataframe")) is not None:	1✔
165	data["dataframe"] = read_json(io.StringIO(dataframe))	1✔
166	if blob := data.get("blob"):	1✔
167	data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])	1✔
168	if sparse_embedding := data.get("sparse_embedding"):	1✔
169	data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)	1✔
170
171	# Store metadata for a moment while we try un-flattening allegedly flatten metadata.
172	# We don't expect both a `meta=` keyword and flatten metadata keys so we'll raise a
173	# ValueError later if this is the case.
174	meta = data.pop("meta", {})	1✔
175	# Unflatten metadata if it was flattened. We assume any keyword argument that's not
176	# a document field is a metadata key. We treat legacy fields as document fields
177	# for backward compatibility.
178	flatten_meta = {}	1✔
179	legacy_fields = ["content_type", "id_hash_keys"]	1✔
180	document_fields = legacy_fields + [f.name for f in fields(cls)]	1✔
181	for key in list(data.keys()):	1✔
182	if key not in document_fields:	1✔
183	flatten_meta[key] = data.pop(key)	1✔
184
185	# We don't support passing both flatten keys and the `meta` keyword parameter
186	if meta and flatten_meta:	1✔
187	raise ValueError(	1✔
188	"You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
189	"but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
190	)
191
192	# Finally put back all the metadata
193	return cls(**data, meta={**meta, **flatten_meta})	1✔
194
195	@property	1✔
196	def content_type(self):	1✔
197	"""
198	Returns the type of the content for the document.
199
200	This is necessary to keep backward compatibility with 1.x.
201
202	:raises ValueError:
203	If both `text` and `dataframe` fields are set or both are missing.
204	"""
205	if self.content is not None and self.dataframe is not None:	1✔
206	raise ValueError("Both text and dataframe are set.")	1✔
207
208	if self.content is not None:	1✔
209	return "text"	1✔
210	elif self.dataframe is not None:	1✔
211	return "table"	1✔
212	raise ValueError("Neither text nor dataframe is set.")	1✔

deepset-ai / haystack / 13074273511

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous