• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 13652289639

04 Mar 2025 11:06AM UTC coverage: 90.216% (+0.1%) from 90.081%
13652289639

push

github

web-flow
refactor!: remove `dataframe` field from `Document` and `ExtractedTableAnswer`; make `pandas` optional (#8906)

* remove dataframe

* release note

* small fix

* group imports

* Update pyproject.toml

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* Update pyproject.toml

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

* address feedback

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>

9599 of 10640 relevant lines covered (90.22%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.33
haystack/dataclasses/document.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import hashlib
1✔
6
from dataclasses import asdict, dataclass, field, fields
1✔
7
from typing import Any, Dict, List, Optional
1✔
8

9
from numpy import ndarray
1✔
10

11
from haystack import logging
1✔
12
from haystack.dataclasses.byte_stream import ByteStream
1✔
13
from haystack.dataclasses.sparse_embedding import SparseEmbedding
1✔
14

15
# Module-level logger obtained from haystack's logging module, keyed by this module's name.
logger = logging.getLogger(__name__)
1✔
16

17

18
class _BackwardCompatible(type):
    """
    Metaclass that handles Document backward compatibility.

    It intercepts the constructor call of the classes using it, so legacy keyword
    arguments are validated, converted or dropped before `__init__` runs.
    """

    def __call__(cls, *args, **kwargs):
        """
        Called before Document.__init__, will remap legacy fields to new ones.

        Only keyword arguments are inspected; positional arguments are forwarded untouched.
        Dataframe is not supported anymore.

        :param args: Positional arguments, passed unchanged to the class constructor.
        :param kwargs: Keyword arguments for the class constructor. `content_type` and
            `id_hash_keys` are discarded, and an `embedding` given as a NumPy array is
            converted to a list.
        :returns: The instance built by the regular class constructor.
        :raises ValueError: If `content` is passed as a keyword and is neither a string nor None.
        """
        ### Conversion from 1.x Document ###
        content = kwargs.get("content")
        # Compare against None explicitly instead of relying on truthiness: falsy non-strings
        # (0, [], {}) must be rejected too, and objects whose truth value is ambiguous
        # (NumPy arrays, DataFrames) must get this error message rather than an unrelated one.
        if content is not None and not isinstance(content, str):
            raise ValueError("The `content` field must be a string or None.")

        # Legacy arguments that are not used anymore are silently dropped
        for legacy_key in ("content_type", "id_hash_keys"):
            kwargs.pop(legacy_key, None)

        # Embeddings were stored as NumPy arrays in 1.x, so we convert them to the new type
        if isinstance(embedding := kwargs.get("embedding"), ndarray):
            kwargs["embedding"] = embedding.tolist()

        return super().__call__(*args, **kwargs)
1✔
48

49

50
@dataclass
class Document(metaclass=_BackwardCompatible):
    """
    Base data class containing some data to be queried.

    A Document can hold text content and/or binary data, plus metadata, a score and
    dense/sparse embeddings. It can be converted to and rebuilt from a dictionary.

    :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
    :param content: Text of the document, if the document contains text.
    :param blob: Binary data associated with the document, if the document has any binary data associated with it.
    :param meta: Additional custom metadata for the document. Must be JSON-serializable.
    :param score: Score of the document. Used for ranking, usually assigned by retrievers.
    :param embedding: dense vector representation of the document.
    :param sparse_embedding: sparse vector representation of the document.
    """

    id: str = field(default="")
    content: Optional[str] = field(default=None)
    blob: Optional[ByteStream] = field(default=None)
    meta: Dict[str, Any] = field(default_factory=dict)
    score: Optional[float] = field(default=None)
    embedding: Optional[List[float]] = field(default=None)
    sparse_embedding: Optional[SparseEmbedding] = field(default=None)

    def __repr__(self):
        """
        Returns a compact, human-readable summary of the Document.

        Only the fields that are set are listed. Content longer than 100 characters is
        truncated, and blob/embeddings are summarized by their size instead of being printed.
        """
        # Named `parts` (not `fields`) to avoid shadowing `dataclasses.fields` imported by this module
        parts = []
        if self.content is not None:
            parts.append(
                f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
            )
        if self.blob is not None:
            parts.append(f"blob: {len(self.blob.data)} bytes")
        if len(self.meta) > 0:
            parts.append(f"meta: {self.meta}")
        if self.score is not None:
            parts.append(f"score: {self.score}")
        if self.embedding is not None:
            parts.append(f"embedding: vector of size {len(self.embedding)}")
        if self.sparse_embedding is not None:
            parts.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
        fields_str = ", ".join(parts)
        return f"{self.__class__.__name__}(id={self.id}, {fields_str})"

    def __eq__(self, other):
        """
        Compares Documents for equality.

        Two Documents are considered equals if they have exactly the same type and
        their dictionary representation is identical.
        """
        if type(self) is not type(other):
            return False
        return self.to_dict() == other.to_dict()

    def __post_init__(self):
        """
        Generate the ID based on the init parameters.
        """
        # Generate an id only if not explicitly set
        self.id = self.id or self._create_id()

    def _create_id(self):
        """
        Creates a hash of the given content that acts as the document's ID.

        :returns: The SHA-256 hex digest of the concatenated string representation of
            content, blob data, blob MIME type, meta, embedding and sparse embedding.
        """
        text = self.content or None
        dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
        blob = self.blob.data if self.blob is not None else None
        mime_type = self.blob.mime_type if self.blob is not None else None
        meta = self.meta or {}
        embedding = self.embedding
        # A missing sparse embedding contributes an empty string (not "None") to the hashed data
        sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
        data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"
        return hashlib.sha256(data.encode("utf-8")).hexdigest()

    def to_dict(self, flatten=True) -> Dict[str, Any]:
        """
        Converts Document into a dictionary.

        `blob` field is converted to a JSON-serializable type.

        :param flatten:
            Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
            When flattening, a metadata key with the same name as a Document field overrides that field's value.
        :returns: The dictionary representation of the Document.
        """
        data = asdict(self)
        if (blob := data.get("blob")) is not None:
            data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}

        if flatten:
            meta = data.pop("meta")
            return {**data, **meta}

        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Document":
        """
        Creates a new Document object from a dictionary.

        The `blob` field is converted to its original type. The dictionary passed by the
        caller is not modified.

        :param data: Dictionary with the Document fields. Metadata can be passed either under
            the `meta` key or as flattened top-level keys, but not both.
        :returns: The new Document.
        :raises ValueError: If both `meta` and flattened metadata keys are present.
        """
        # Work on a shallow copy so the caller's dictionary is left untouched
        data = dict(data)

        if blob := data.get("blob"):
            data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
        if sparse_embedding := data.get("sparse_embedding"):
            data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

        # Set metadata aside while we un-flatten possibly flattened metadata.
        # An explicit `"meta": None` is treated like missing metadata.
        meta = data.pop("meta", None) or {}

        # Unflatten metadata if it was flattened. We assume any key that's not
        # a document field is a metadata key. We treat legacy fields as document fields
        # for backward compatibility.
        legacy_fields = ["content_type", "id_hash_keys"]
        document_fields = {*legacy_fields, *(f.name for f in fields(cls))}
        init_kwargs = {key: value for key, value in data.items() if key in document_fields}
        flatten_meta = {key: value for key, value in data.items() if key not in document_fields}

        # We don't support passing both flatten keys and the `meta` keyword parameter
        if meta and flatten_meta:
            raise ValueError(
                "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
                "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
            )

        # Finally put back all the metadata
        return cls(**init_kwargs, meta={**meta, **flatten_meta})

    @property
    def content_type(self):
        """
        Returns the type of the content for the document.

        This is necessary to keep backward compatibility with 1.x.

        :returns: The string `"text"` when `content` is set.
        :raises ValueError: If `content` is None.
        """
        if self.content is not None:
            return "text"
        raise ValueError("Content is not set.")
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc