• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 13074273511

31 Jan 2025 02:03PM UTC coverage: 91.359% (+0.007%) from 91.352%
13074273511

push

github

web-flow
deprecate dataframe and ExtractedTableAnswer (#8789)

8871 of 9710 relevant lines covered (91.36%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.52
haystack/dataclasses/document.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import hashlib
1✔
6
import io
1✔
7
import warnings
1✔
8
from dataclasses import asdict, dataclass, field, fields
1✔
9
from typing import Any, Dict, List, Optional
1✔
10

11
from numpy import ndarray
1✔
12
from pandas import DataFrame, read_json
1✔
13

14
from haystack import logging
1✔
15
from haystack.dataclasses.byte_stream import ByteStream
1✔
16
from haystack.dataclasses.sparse_embedding import SparseEmbedding
1✔
17

18
# Module-level logger. Note that `logging` here is the `haystack.logging` module imported above,
# not the standard-library `logging` package.
logger = logging.getLogger(__name__)
1✔
19

20

21
class _BackwardCompatible(type):
    """
    Metaclass that rewrites legacy keyword arguments before an instance is built.
    """

    def __call__(cls, *args, **kwargs):
        """
        Intercept instantiation and normalise legacy keyword arguments.

        Runs before the class' own `__init__`. Only keyword arguments are inspected;
        positional arguments are forwarded untouched.

        - a `content` keyword holding a pandas DataFrame is moved to `dataframe`
        - `content_type` and `id_hash_keys` are dropped
        - an `embedding` keyword holding a NumPy array is converted to a plain list
        """
        # A DataFrame passed as `content` belongs in the `dataframe` field.
        if isinstance(kwargs.get("content"), DataFrame):
            kwargs["dataframe"] = kwargs.pop("content")

        # Keywords that are still accepted but no longer used.
        for obsolete in ("content_type", "id_hash_keys"):
            kwargs.pop(obsolete, None)

        # Embeddings were stored as NumPy arrays in 1.x; the new type is a list.
        legacy_embedding = kwargs.get("embedding")
        if isinstance(legacy_embedding, ndarray):
            kwargs["embedding"] = legacy_embedding.tolist()

        return super().__call__(*args, **kwargs)
1✔
51

52

53
@dataclass
class Document(metaclass=_BackwardCompatible):
    """
    Base data class containing some data to be queried.

    Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
    to/from dictionary and JSON.

    :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
    :param content: Text of the document, if the document contains text.
    :param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
        Deprecated: setting it emits a `DeprecationWarning`.
    :param blob: Binary data associated with the document, if the document has any binary data associated with it.
    :param meta: Additional custom metadata for the document. Must be JSON-serializable.
    :param score: Score of the document. Used for ranking, usually assigned by retrievers.
    :param embedding: dense vector representation of the document.
    :param sparse_embedding: sparse vector representation of the document.
    """

    id: str = field(default="")
    content: Optional[str] = field(default=None)
    dataframe: Optional[DataFrame] = field(default=None)
    blob: Optional[ByteStream] = field(default=None)
    meta: Dict[str, Any] = field(default_factory=dict)
    score: Optional[float] = field(default=None)
    embedding: Optional[List[float]] = field(default=None)
    sparse_embedding: Optional[SparseEmbedding] = field(default=None)

    def __repr__(self) -> str:
        """
        Builds a compact, human-readable summary of the fields that are set.

        Long content is cut to its first 100 characters; large fields (dataframe, blob, embeddings)
        are summarised by their size instead of being printed in full.
        """
        # Named `parts` so the `dataclasses.fields` import is not shadowed.
        parts = []
        if self.content is not None:
            # Only add the ellipsis when the content is actually cut off.
            shown = self.content if len(self.content) <= 100 else f"{self.content[:100]}..."
            parts.append(f"content: '{shown}'")
        if self.dataframe is not None:
            parts.append(f"dataframe: {self.dataframe.shape}")
        if self.blob is not None:
            parts.append(f"blob: {len(self.blob.data)} bytes")
        if len(self.meta) > 0:
            parts.append(f"meta: {self.meta}")
        if self.score is not None:
            parts.append(f"score: {self.score}")
        if self.embedding is not None:
            parts.append(f"embedding: vector of size {len(self.embedding)}")
        if self.sparse_embedding is not None:
            parts.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
        fields_str = ", ".join(parts)
        return f"{self.__class__.__name__}(id={self.id}, {fields_str})"

    def __eq__(self, other: object) -> bool:
        """
        Compares Documents for equality.

        Two Documents are considered equals if they are of the exact same type and their dictionary
        representation is identical.
        """
        if type(self) is not type(other):
            return False
        return self.to_dict() == other.to_dict()

    def __post_init__(self):
        """
        Generate the ID based on the init parameters and warn about deprecated fields.
        """
        # Generate an id only if not explicitly set
        self.id = self.id or self._create_id()

        if self.dataframe is not None:
            msg = "The `dataframe` field is deprecated and will be removed in Haystack 2.11.0."
            warnings.warn(msg, DeprecationWarning)

    def _create_id(self) -> str:
        """
        Creates a hash of the given content that acts as the document's ID.

        :returns:
            The hexadecimal SHA-256 digest of the concatenated string form of all fields except `id` and `score`.
        """
        # NOTE: the exact string built below determines the generated IDs. The quirks (empty content
        # hashed like `None`, missing sparse embedding rendered as "" rather than "None") are kept
        # on purpose: changing any of them would change the ID of otherwise identical documents.
        text = self.content or None
        dataframe = self.dataframe.to_json() if self.dataframe is not None else None
        blob = self.blob.data if self.blob is not None else None
        mime_type = self.blob.mime_type if self.blob is not None else None
        meta = self.meta or {}
        embedding = self.embedding
        sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
        data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"
        return hashlib.sha256(data.encode("utf-8")).hexdigest()

    def to_dict(self, flatten: bool = True) -> Dict[str, Any]:
        """
        Converts Document into a dictionary.

        `dataframe` and `blob` fields are converted to JSON-serializable types.

        :param flatten:
            Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
        :returns:
            A dictionary with one entry per field. When `flatten` is `True` the `meta` entry is removed and its
            keys are merged into the top level; a meta key with the same name as a field overrides that field.
        """
        data = asdict(self)
        if (dataframe := data.get("dataframe")) is not None:
            data["dataframe"] = dataframe.to_json()
        if (blob := data.get("blob")) is not None:
            # `asdict` has already turned the blob into a plain dict; bytes become a list of ints.
            data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}

        if flatten:
            meta = data.pop("meta")
            return {**data, **meta}

        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Document":
        """
        Creates a new Document object from a dictionary.

        The `dataframe` and `blob` fields are converted to their original types.
        The dictionary passed in is not modified.

        :param data:
            Dictionary with the Document fields. Metadata can be given either under the `meta` key or as
            flattened top-level keys, but not both.
        :returns:
            The Document built from `data`.
        :raises ValueError:
            If both a non-empty `meta` entry and flattened metadata keys are present.
        """
        # Work on a shallow copy: the conversions and pops below must not leak into the caller's dict.
        data = dict(data)

        if (dataframe := data.get("dataframe")) is not None:
            data["dataframe"] = read_json(io.StringIO(dataframe))
        if blob := data.get("blob"):
            data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
        if sparse_embedding := data.get("sparse_embedding"):
            data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

        # Store metadata for a moment while we try un-flattening allegedly flattened metadata.
        # We don't expect both a `meta=` keyword and flattened metadata keys so we'll raise a
        # ValueError later if this is the case.
        meta = data.pop("meta", {})
        # Unflatten metadata if it was flattened. We assume any keyword argument that's not
        # a document field is a metadata key. We treat legacy fields as document fields
        # for backward compatibility.
        flatten_meta = {}
        legacy_fields = ["content_type", "id_hash_keys"]
        document_fields = {*legacy_fields, *(f.name for f in fields(cls))}
        for key in list(data.keys()):
            if key not in document_fields:
                flatten_meta[key] = data.pop(key)

        # We don't support passing both flattened keys and the `meta` keyword parameter
        if meta and flatten_meta:
            raise ValueError(
                "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
                "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
            )

        # Finally put back all the metadata
        return cls(**data, meta={**meta, **flatten_meta})

    @property
    def content_type(self):
        """
        Returns the type of the content for the document.

        This is necessary to keep backward compatibility with 1.x.

        :returns:
            `"text"` if `content` is set, `"table"` if `dataframe` is set.
        :raises ValueError:
            If both `content` and `dataframe` fields are set or both are missing.
        """
        if self.content is not None and self.dataframe is not None:
            raise ValueError("Both text and dataframe are set.")

        if self.content is not None:
            return "text"
        elif self.dataframe is not None:
            return "table"
        raise ValueError("Neither text nor dataframe is set.")
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc