• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 14064199728

25 Mar 2025 03:52PM UTC coverage: 90.154% (+0.08%) from 90.07%
14064199728

Pull #9055

github

web-flow
Merge eaafb5e56 into e64db6197
Pull Request #9055: Added retries parameters to pipeline.draw()

9898 of 10979 relevant lines covered (90.15%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.02
haystack/dataclasses/document.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import hashlib
1✔
6
from dataclasses import asdict, dataclass, field, fields
1✔
7
from typing import Any, Dict, List, Optional
1✔
8

9
from numpy import ndarray
1✔
10

11
from haystack.dataclasses.byte_stream import ByteStream
1✔
12
from haystack.dataclasses.sparse_embedding import SparseEmbedding
1✔
13

14
# Keyword arguments from Haystack 1.x Documents that are still accepted for backward
# compatibility but are dropped before a Document is actually built.
LEGACY_FIELDS = [
    "content_type",
    "id_hash_keys",
    "dataframe",
]
1✔
15

16

17
class _BackwardCompatible(type):
    """
    Metaclass that handles Document backward compatibility.

    It intercepts the keyword arguments used to instantiate the class and adapts 1.x style
    arguments before they are forwarded to the regular construction machinery.
    """

    def __call__(cls, *args, **kwargs):
        """
        Called before Document.__init__, handles legacy fields.

        Embedding was stored as NumPy arrays in 1.x, so we convert it to a list of floats.
        Other legacy fields are removed.

        Only keyword arguments are inspected; positional arguments are forwarded untouched.

        :raises ValueError: If the `content` keyword argument is neither a string nor None.
        """
        ### Conversion from 1.x Document ###
        content = kwargs.get("content")
        # Compare against None explicitly: a truthiness test would let falsy non-string
        # values such as 0, [] or b"" slip past the type check.
        if content is not None and not isinstance(content, str):
            raise ValueError("The `content` field must be a string or None.")

        # Embeddings were stored as NumPy arrays in 1.x, so we convert them to the new type
        embedding = kwargs.get("embedding")
        if isinstance(embedding, ndarray):
            kwargs["embedding"] = embedding.tolist()

        # Remove legacy fields
        for field_name in LEGACY_FIELDS:
            kwargs.pop(field_name, None)

        return super().__call__(*args, **kwargs)
1✔
43

44

45
@dataclass
class Document(metaclass=_BackwardCompatible):
    """
    Base data class containing some data to be queried.

    Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
    to/from dictionary and JSON.

    :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
    :param content: Text of the document, if the document contains text.
    :param blob: Binary data associated with the document, if the document has any binary data associated with it.
    :param meta: Additional custom metadata for the document. Must be JSON-serializable.
    :param score: Score of the document. Used for ranking, usually assigned by retrievers.
    :param embedding: dense vector representation of the document.
    :param sparse_embedding: sparse vector representation of the document.
    """

    id: str = field(default="")
    content: Optional[str] = field(default=None)
    blob: Optional[ByteStream] = field(default=None)
    meta: Dict[str, Any] = field(default_factory=dict)
    score: Optional[float] = field(default=None)
    embedding: Optional[List[float]] = field(default=None)
    sparse_embedding: Optional[SparseEmbedding] = field(default=None)

    def __repr__(self):
        """
        Returns a compact, human-readable summary of the Document.

        Only the fields that are set are listed. Content of 100 characters or more is shown as its
        first 100 characters followed by `...`.
        """
        # Named `parts` so the `fields` function imported from `dataclasses` is not shadowed.
        parts = []
        if self.content is not None:
            parts.append(
                f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
            )
        if self.blob is not None:
            parts.append(f"blob: {len(self.blob.data)} bytes")
        if self.meta:
            parts.append(f"meta: {self.meta}")
        if self.score is not None:
            parts.append(f"score: {self.score}")
        if self.embedding is not None:
            parts.append(f"embedding: vector of size {len(self.embedding)}")
        if self.sparse_embedding is not None:
            parts.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
        fields_str = ", ".join(parts)
        return f"{self.__class__.__name__}(id={self.id}, {fields_str})"

    def __eq__(self, other):
        """
        Compares Documents for equality.

        Two Documents are considered equals if their dictionary representation is identical.
        Objects of a different type are never equal to a Document.
        """
        if type(self) != type(other):
            return False
        return self.to_dict() == other.to_dict()

    def __post_init__(self):
        """
        Generate the ID based on the init parameters.
        """
        # Generate an id only if not explicitly set
        self.id = self.id or self._create_id()

    def _create_id(self):
        """
        Creates a hash of the given content that acts as the document's ID.

        :returns:
            The hexadecimal SHA-256 digest of the string built from content, blob data, blob mime type,
            meta, embedding and sparse embedding.
        """
        # NOTE: the order and the string formatting of these values define the ID. Changing any of
        # them changes the ID generated for the same Document.
        text = self.content or None
        dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
        blob = self.blob.data if self.blob is not None else None
        mime_type = self.blob.mime_type if self.blob is not None else None
        meta = self.meta or {}
        embedding = self.embedding
        sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
        data = f"{text}{dataframe}{blob}{mime_type}{meta}{embedding}{sparse_embedding}"
        return hashlib.sha256(data.encode("utf-8")).hexdigest()

    def to_dict(self, flatten: bool = True) -> Dict[str, Any]:
        """
        Converts Document into a dictionary.

        `blob` field is converted to a JSON-serializable type.

        :param flatten:
            Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
        :returns:
            The dictionary representation of the Document. When `flatten` is `True` the metadata keys are
            merged into the top level, and a metadata key wins over a Document field with the same name.
        """
        data = asdict(self)
        if (blob := data.get("blob")) is not None:
            # Bytes are stored as a list of integers so that the result can be serialized to JSON
            data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}

        if flatten:
            meta = data.pop("meta")
            return {**data, **meta}

        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Document":
        """
        Creates a new Document object from a dictionary.

        The `blob` field is converted to its original type.
        The dictionary passed by the caller is left unmodified.

        :param data:
            The dictionary to build the Document from. Metadata can be passed either under the `meta` key
            or as flattened top-level keys, but not both.
        :returns:
            The new Document.
        :raises ValueError:
            If both the `meta` key and flattened metadata keys are present.
        """
        # Work on a shallow copy: the keys replaced and popped below must not leak into the caller's dictionary.
        data = dict(data)

        if blob := data.get("blob"):
            data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
        if sparse_embedding := data.get("sparse_embedding"):
            data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

        # Store metadata for a moment while we try un-flattening allegedly flattened metadata.
        # We don't expect both a `meta=` keyword and flattened metadata keys so we'll raise a
        # ValueError later if this is the case. An explicit `meta` of None is treated as empty metadata.
        meta = data.pop("meta", None) or {}
        # Unflatten metadata if it was flattened. We assume any keyword argument that's not
        # a document field is a metadata key. We treat legacy fields as document fields
        # for backward compatibility.
        document_fields = LEGACY_FIELDS + [f.name for f in fields(cls)]
        flatten_meta = {key: data.pop(key) for key in list(data) if key not in document_fields}

        # We don't support passing both flattened keys and the `meta` keyword parameter
        if meta and flatten_meta:
            raise ValueError(
                "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
                "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
            )

        # Finally put back all the metadata
        return cls(**data, meta={**meta, **flatten_meta})

    @property
    def content_type(self):
        """
        Returns the type of the content for the document.

        This is necessary to keep backward compatibility with 1.x.

        :raises ValueError: If `content` is not set.
        """
        if self.content is not None:
            return "text"
        raise ValueError("Content is not set.")
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc