• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 15848832158

24 Jun 2025 11:10AM UTC coverage: 90.198% (+0.02%) from 90.182%
15848832158

push

github

web-flow
fix: Update the de/serialization with schema utils (#9526)

* Update the util methods

* Update tests

* fix tests

* schema fix

* Add json schema for tuples and sets

* Add proper conversion for sets and tuples

* Adjust typing

* PR comments

* Linting

* Optimize deserialization

* remove TODO

* PR comments

* PR comments

* Update tests and deserialization error

* Support legacy deserialization

* Update deprecating warning

* Update test

11613 of 12875 relevant lines covered (90.2%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.09
haystack/utils/base_serialization.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
import warnings
1✔
6
from typing import Any, Dict
1✔
7

8
from haystack.core.errors import DeserializationError, SerializationError
1✔
9
from haystack.core.serialization import generate_qualified_class_name, import_class_by_name
1✔
10

11

12
def serialize_class_instance(obj: Any) -> Dict[str, Any]:
    """
    Wraps the `to_dict` output of an object together with its qualified class name.

    :param obj:
        The object to be serialized. It must expose a `to_dict` attribute.
    :returns:
        A dictionary with two keys: `type`, the qualified class name of `obj`, and `data`,
        the value returned by `obj.to_dict()`.
    :raises SerializationError:
        If the object does not have a `to_dict` method.
    """
    obj_type = type(obj)

    if hasattr(obj, "to_dict"):
        # `to_dict` is invoked before the class name is resolved, so its errors surface first
        payload = obj.to_dict()
        return {"type": generate_qualified_class_name(obj_type), "data": payload}

    raise SerializationError(f"Object of class '{obj_type.__name__}' does not have a 'to_dict' method")
1✔
28

29

30
def deserialize_class_instance(data: Dict[str, Any]) -> Any:
    """
    Rebuilds an object from the dictionary representation produced by `serialize_class_instance`.

    :param data:
        The dictionary to deserialize from. It must contain a `type` key (qualified class name)
        and a `data` key (the payload handed to the class' `from_dict`).
    :returns:
        The object returned by the `from_dict` method of the imported class.
    :raises DeserializationError:
        If the `type` or `data` key is missing, the class cannot be imported, or the
        class does not have a `from_dict` method.
    """
    # Validate the envelope before touching any of its values
    if "type" not in data:
        raise DeserializationError("Missing 'type' in serialization data")
    if "data" not in data:
        raise DeserializationError("Missing 'data' in serialization data")

    qualified_name = data["type"]
    try:
        target_cls = import_class_by_name(qualified_name)
    except ImportError as import_err:
        raise DeserializationError(f"Class '{qualified_name}' not correctly imported") from import_err

    if hasattr(target_cls, "from_dict"):
        return target_cls.from_dict(data["data"])

    raise DeserializationError(f"Class '{qualified_name}' does not have a 'from_dict' method")
1✔
56

57

58
def _serialize_value_with_schema(payload: Any) -> Dict[str, Any]:
1✔
59
    """
60
    Serializes a value into a schema-aware format suitable for storage or transmission.
61

62
    The output format separates the schema information from the actual data, making it easier
63
    to deserialize complex nested structures correctly.
64

65
    The function handles:
66
    - Objects with to_dict() methods (e.g. dataclasses)
67
    - Objects with __dict__ attributes
68
    - Dictionaries
69
    - Lists, tuples, and sets. Lists with mixed types are not supported.
70
    - Primitive types (str, int, float, bool, None)
71

72
    :param payload: The value to serialize (can be any type)
73
    :returns: The serialized dict representation of the given value. Contains two keys:
74
        - "serialization_schema": Contains type information for each field.
75
        - "serialized_data": Contains the actual data in a simplified format.
76

77
    """
78
    # Handle dictionary case - iterate through fields
79
    if isinstance(payload, dict):
1✔
80
        schema: Dict[str, Any] = {}
1✔
81
        data: Dict[str, Any] = {}
1✔
82

83
        for field, val in payload.items():
1✔
84
            # Recursively serialize each field
85
            serialized_value = _serialize_value_with_schema(val)
1✔
86
            schema[field] = serialized_value["serialization_schema"]
1✔
87
            data[field] = serialized_value["serialized_data"]
1✔
88

89
        return {"serialization_schema": {"type": "object", "properties": schema}, "serialized_data": data}
1✔
90

91
    # Handle array case - iterate through elements
92
    elif isinstance(payload, (list, tuple, set)):
1✔
93
        # Convert to list for consistent handling
94
        pure_list = _convert_to_basic_types(list(payload))
1✔
95

96
        # Determine item type from first element (if any)
97
        if payload:
1✔
98
            first = next(iter(payload))
1✔
99
            item_schema = _serialize_value_with_schema(first)
1✔
100
            base_schema = {"type": "array", "items": item_schema["serialization_schema"]}
1✔
101
        else:
102
            base_schema = {"type": "array", "items": {}}
1✔
103

104
        # Add JSON Schema properties to infer sets and tuples
105
        if isinstance(payload, set):
1✔
106
            base_schema["uniqueItems"] = True
1✔
107
        elif isinstance(payload, tuple):
1✔
108
            base_schema["minItems"] = len(payload)
1✔
109
            base_schema["maxItems"] = len(payload)
1✔
110

111
        return {"serialization_schema": base_schema, "serialized_data": pure_list}
1✔
112

113
    # Handle Haystack style objects (e.g. dataclasses and Components)
114
    elif hasattr(payload, "to_dict") and callable(payload.to_dict):
1✔
115
        type_name = generate_qualified_class_name(type(payload))
1✔
116
        pure = _convert_to_basic_types(payload)
1✔
117
        schema = {"type": type_name}
1✔
118
        return {"serialization_schema": schema, "serialized_data": pure}
1✔
119

120
    # Handle arbitrary objects with __dict__
121
    elif hasattr(payload, "__dict__"):
1✔
122
        type_name = generate_qualified_class_name(type(payload))
×
123
        pure = _convert_to_basic_types(vars(payload))
×
124
        schema = {"type": type_name}
×
125
        return {"serialization_schema": schema, "serialized_data": pure}
×
126

127
    # Handle primitives
128
    else:
129
        prim_type = _primitive_schema_type(payload)
1✔
130
        schema = {"type": prim_type}
1✔
131
        return {"serialization_schema": schema, "serialized_data": payload}
1✔
132

133

134
def _primitive_schema_type(value: Any) -> str:
1✔
135
    """
136
    Helper function to determine the schema type for primitive values.
137
    """
138
    if value is None:
1✔
139
        return "null"
1✔
140
    if isinstance(value, bool):
1✔
141
        return "boolean"
1✔
142
    if isinstance(value, int):
1✔
143
        return "integer"
1✔
144
    if isinstance(value, float):
1✔
145
        return "number"
×
146
    if isinstance(value, str):
1✔
147
        return "string"
1✔
148
    return "string"  # fallback
×
149

150

151
def _convert_to_basic_types(value: Any) -> Any:
1✔
152
    """
153
    Helper function to recursively convert complex Python objects into their basic type equivalents.
154

155
    This helper function traverses through nested data structures and converts all complex
156
    objects (custom classes, dataclasses, etc.) into basic Python types (dict, list, str,
157
    int, float, bool, None) that can be easily serialized.
158

159
    The function handles:
160
    - Objects with to_dict() methods: converted using their to_dict implementation
161
    - Objects with __dict__ attribute: converted to plain dictionaries
162
    - Dictionaries: recursively converted values while preserving keys
163
    - Sequences (list, tuple, set): recursively converted while preserving type
164
    - Primitive types: returned as-is
165

166
    """
167
    # dataclass‐style objects
168
    if hasattr(value, "to_dict") and callable(value.to_dict):
1✔
169
        return _convert_to_basic_types(value.to_dict())
1✔
170

171
    # arbitrary objects with __dict__
172
    if hasattr(value, "__dict__"):
1✔
173
        return {k: _convert_to_basic_types(v) for k, v in vars(value).items()}
×
174

175
    # dicts
176
    if isinstance(value, dict):
1✔
177
        return {k: _convert_to_basic_types(v) for k, v in value.items()}
1✔
178

179
    # sequences
180
    if isinstance(value, (list, tuple, set)):
1✔
181
        return [_convert_to_basic_types(v) for v in value]
1✔
182

183
    # primitive
184
    return value
1✔
185

186

187
def _deserialize_value_with_schema(serialized: Dict[str, Any]) -> Any:  # pylint: disable=too-many-return-statements, # noqa: PLR0911, PLR0912
1✔
188
    """
189
    Deserializes a value with schema information back to its original form.
190

191
    Takes a dict of the form:
192
      {
193
         "serialization_schema": {"type": "integer"} or {"type": "object", "properties": {...}},
194
         "serialized_data": <the actual data>
195
      }
196

197
    :param serialized: The serialized dict with schema and data.
198
    :returns: The deserialized value in its original form.
199
    """
200

201
    if not serialized or "serialization_schema" not in serialized or "serialized_data" not in serialized:
1✔
202
        raise DeserializationError(
×
203
            f"Invalid format of passed serialized payload. Expected a dictionary with keys "
204
            f"'serialization_schema' and 'serialized_data'. Got: {serialized}"
205
        )
206
    schema = serialized["serialization_schema"]
1✔
207
    data = serialized["serialized_data"]
1✔
208

209
    schema_type = schema.get("type")
1✔
210

211
    if not schema_type:
1✔
212
        # for backward comaptability till Haystack 2.16 we use legacy implementation
213
        warnings.warn(
1✔
214
            "Missing 'type' key in 'serialization_schema'. This likely indicates that you're using a serialized "
215
            "State object created with a version of Haystack older than 2.15.0. "
216
            "Support for the old serialization format will be removed in Haystack 2.16.0. "
217
            "Please upgrade to the new serialization format to ensure forward compatibility.",
218
            DeprecationWarning,
219
        )
220
        return _deserialize_value_with_schema_legacy(serialized)
1✔
221

222
    # Handle object case (dictionary with properties)
223
    if schema_type == "object":
1✔
224
        properties = schema.get("properties")
1✔
225
        if properties:
1✔
226
            result: Dict[str, Any] = {}
1✔
227

228
            if isinstance(data, dict):
1✔
229
                for field, raw_value in data.items():
1✔
230
                    field_schema = properties.get(field)
1✔
231
                    if field_schema:
1✔
232
                        # Recursively deserialize each field - avoid creating temporary dict
233
                        result[field] = _deserialize_value_with_schema(
1✔
234
                            {"serialization_schema": field_schema, "serialized_data": raw_value}
235
                        )
236

237
            return result
1✔
238
        else:
239
            return _deserialize_value(data)
1✔
240

241
    # Handle array case
242
    elif schema_type == "array":
1✔
243
        # Cache frequently accessed schema properties
244
        item_schema = schema.get("items", {})
1✔
245
        item_type = item_schema.get("type", "any")
1✔
246
        is_set = schema.get("uniqueItems") is True
1✔
247
        is_tuple = schema.get("minItems") is not None and schema.get("maxItems") is not None
1✔
248

249
        # Handle nested objects/arrays first (most complex case)
250
        if item_type in ("object", "array"):
1✔
251
            return [
1✔
252
                _deserialize_value_with_schema({"serialization_schema": item_schema, "serialized_data": item})
253
                for item in data
254
            ]
255

256
        # Helper function to deserialize individual items
257
        def deserialize_item(item):
1✔
258
            if item_type == "any":
1✔
259
                return _deserialize_value(item)
×
260
            else:
261
                return _deserialize_value({"type": item_type, "data": item})
1✔
262

263
        # Handle different collection types
264
        if is_set:
1✔
265
            return {deserialize_item(item) for item in data}
1✔
266
        elif is_tuple:
1✔
267
            return tuple(deserialize_item(item) for item in data)
1✔
268
        else:
269
            return [deserialize_item(item) for item in data]
1✔
270

271
    # Handle primitive types
272
    elif schema_type in ("null", "boolean", "integer", "number", "string"):
1✔
273
        return data
1✔
274

275
    # Handle custom class types
276
    else:
277
        return _deserialize_value({"type": schema_type, "data": data})
1✔
278

279

280
def _deserialize_value(value: Any) -> Any:  # pylint: disable=too-many-return-statements # noqa: PLR0911
1✔
281
    """
282
    Helper function to deserialize values from their envelope format {"type": T, "data": D}.
283

284
    Handles four cases:
285
    - Typed envelopes: {"type": T, "data": D} where T determines deserialization method
286
    - Plain dicts: recursively deserialize values
287
    - Collections (list/tuple/set): recursively deserialize elements
288
    - Other values: return as-is
289

290
    :param value: The value to deserialize
291
    :returns: The deserialized value
292

293
    """
294
    # 1) Envelope case
295
    if isinstance(value, dict) and "type" in value and "data" in value:
1✔
296
        t = value["type"]
1✔
297
        payload = value["data"]
1✔
298

299
        # 1.a) Array
300
        if t == "array":
1✔
301
            return [_deserialize_value(child) for child in payload]
×
302

303
        # 1.b) Generic object/dict
304
        if t == "object":
1✔
305
            return {k: _deserialize_value(v) for k, v in payload.items()}
1✔
306

307
        # 1.c) Primitive
308
        if t in ("null", "boolean", "integer", "number", "string"):
1✔
309
            return payload
1✔
310

311
        # 1.d) Custom class
312
        cls = import_class_by_name(t)
1✔
313
        # first, recursively deserialize the inner payload
314
        deserialized_payload = {k: _deserialize_value(v) for k, v in payload.items()}
1✔
315
        # try from_dict
316
        if hasattr(cls, "from_dict") and callable(cls.from_dict):
1✔
317
            return cls.from_dict(deserialized_payload)
1✔
318
        # fallback: set attributes on a blank instance
319
        instance = cls.__new__(cls)
×
320
        for attr_name, attr_value in deserialized_payload.items():
×
321
            setattr(instance, attr_name, attr_value)
×
322
        return instance
×
323

324
    # 2) Plain dict (no envelope) → recurse
325
    if isinstance(value, dict):
1✔
326
        return {k: _deserialize_value(v) for k, v in value.items()}
1✔
327

328
    # 3) Collections → recurse
329
    if isinstance(value, (list, tuple, set)):
1✔
330
        return type(value)(_deserialize_value(v) for v in value)
1✔
331

332
    # 4) Fallback (shouldn't usually happen with our schema)
333
    return value
1✔
334

335

336
def _deserialize_value_with_schema_legacy(serialized: Dict[str, Any]) -> Dict[str, Any]:
1✔
337
    """
338
    Legacy function for deserializing a dictionary with schema information and data to original values.
339

340
    Kept for backward compatibility till Haystack 2.16.0.
341
    Takes a dict of the form:
342
      {
343
         "schema": {
344
            "numbers": {"type": "integer"},
345
            "messages": {"type": "array", "items": {"type": "haystack.dataclasses.chat_message.ChatMessage"}},
346
        },
347
        "data": {
348
            "numbers": 1,
349
            "messages": [{"role": "user", "meta": {}, "name": None, "content": [{"text": "Hello, world!"}]}],
350
      }
351

352
    :param serialized: The serialized dict with schema and data.
353
    :returns: The deserialized dict with original values.
354
    """
355
    schema = serialized.get("serialization_schema", {})
1✔
356
    data = serialized.get("serialized_data", {})
1✔
357

358
    result: Dict[str, Any] = {}
1✔
359
    for field, raw in data.items():
1✔
360
        info = schema.get(field)
1✔
361
        # no schema entry → just deep-deserialize whatever we have
362
        if not info:
1✔
363
            result[field] = _deserialize_value(raw)
×
364
            continue
×
365

366
        t = info["type"]
1✔
367

368
        # ARRAY case
369
        if t == "array":
1✔
370
            item_type = info["items"]["type"]
1✔
371
            reconstructed = []
1✔
372
            for item in raw:
1✔
373
                envelope = {"type": item_type, "data": item}
1✔
374
                reconstructed.append(_deserialize_value(envelope))
1✔
375
            result[field] = reconstructed
1✔
376

377
        # PRIMITIVE case
378
        elif t in ("null", "boolean", "integer", "number", "string"):
1✔
379
            result[field] = raw
1✔
380

381
        # GENERIC OBJECT
382
        elif t == "object":
1✔
383
            envelope = {"type": "object", "data": raw}
1✔
384
            result[field] = _deserialize_value(envelope)
1✔
385

386
        # CUSTOM CLASS
387
        else:
388
            envelope = {"type": t, "data": raw}
×
389
            result[field] = _deserialize_value(envelope)
×
390

391
    return result
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc