• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

deepset-ai / haystack / 16023509185

02 Jul 2025 11:10AM UTC coverage: 90.367% (-0.005%) from 90.372%
16023509185

Pull #9585

github

web-flow
Merge b9c99fe57 into adb2759d0
Pull Request #9585: chore: remove backward compatibility for `State` deserialization

11708 of 12956 relevant lines covered (90.37%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.41
haystack/utils/base_serialization.py
1
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2
#
3
# SPDX-License-Identifier: Apache-2.0
4

5
from typing import Any, Dict
1✔
6

7
from haystack.core.errors import DeserializationError, SerializationError
1✔
8
from haystack.core.serialization import generate_qualified_class_name, import_class_by_name
1✔
9

10

11
def serialize_class_instance(obj: Any) -> Dict[str, Any]:
    """
    Wraps the `to_dict` output of an object together with its qualified class name.

    :param obj:
        The object to serialize. It must expose a `to_dict` attribute.
    :returns:
        A dictionary with two keys: `type`, the qualified class name of `obj`, and `data`,
        the value returned by `obj.to_dict()`.
    :raises SerializationError:
        If the object does not have a `to_dict` method.
    """
    if hasattr(obj, "to_dict"):
        payload = obj.to_dict()
        qualified_name = generate_qualified_class_name(type(obj))
        return {"type": qualified_name, "data": payload}

    raise SerializationError(f"Object of class '{type(obj).__name__}' does not have a 'to_dict' method")
27

28

29
def deserialize_class_instance(data: Dict[str, Any]) -> Any:
    """
    Rebuilds an object from the dictionary representation generated by `serialize_class_instance`.

    :param data:
        The dictionary to deserialize from. It must contain a `type` key holding the qualified
        class name and a `data` key holding the payload handed to the class's `from_dict`.
    :returns:
        The object returned by the class's `from_dict` method.
    :raises DeserializationError:
        If `type` or `data` is missing, the class cannot be imported, or the class does not
        have a `from_dict` method.
    """
    if "type" not in data:
        raise DeserializationError("Missing 'type' in serialization data")
    if "data" not in data:
        raise DeserializationError("Missing 'data' in serialization data")

    class_name = data["type"]
    try:
        target_class = import_class_by_name(class_name)
    except ImportError as import_error:
        raise DeserializationError(f"Class '{class_name}' not correctly imported") from import_error

    if hasattr(target_class, "from_dict"):
        return target_class.from_dict(data["data"])

    raise DeserializationError(f"Class '{class_name}' does not have a 'from_dict' method")
55

56

57
def _serialize_value_with_schema(payload: Any) -> Dict[str, Any]:
    """
    Splits a value into a schema describing its types and a plain-data representation.

    Keeping the type information separate from the data lets nested structures be rebuilt
    correctly when deserializing.

    Supported inputs, checked in this order:
    - Dictionaries: every value is serialized recursively.
    - Lists, tuples, and sets: the item schema is taken from the first element only, so
      collections with mixed types are not supported.
    - Objects with a callable `to_dict` attribute (e.g. dataclasses and Components).
    - Objects with a `__dict__` attribute.
    - Anything else is treated as a primitive (str, int, float, bool, None).

    :param payload: The value to serialize (can be any type)
    :returns: The serialized dict representation of the given value. Contains two keys:
        - "serialization_schema": Contains type information for each field.
        - "serialized_data": Contains the actual data in a simplified format.

    """
    # Dictionaries: build the schema and the data field by field
    if isinstance(payload, dict):
        properties: Dict[str, Any] = {}
        values: Dict[str, Any] = {}
        for key, item in payload.items():
            nested = _serialize_value_with_schema(item)
            properties[key] = nested["serialization_schema"]
            values[key] = nested["serialized_data"]
        return {"serialization_schema": {"type": "object", "properties": properties}, "serialized_data": values}

    # Lists, tuples and sets all share the "array" schema type
    if isinstance(payload, (list, tuple, set)):
        flattened = _convert_to_basic_types(list(payload))

        # The first element (if any) stands in for the type of every item
        item_schema: Dict[str, Any] = {}
        if payload:
            item_schema = _serialize_value_with_schema(next(iter(payload)))["serialization_schema"]
        array_schema: Dict[str, Any] = {"type": "array", "items": item_schema}

        # JSON Schema keywords mark sets and tuples so they can be told apart from lists later
        if isinstance(payload, set):
            array_schema["uniqueItems"] = True
        elif isinstance(payload, tuple):
            size = len(payload)
            array_schema["minItems"] = size
            array_schema["maxItems"] = size

        return {"serialization_schema": array_schema, "serialized_data": flattened}

    # Custom objects: Haystack-style ones exposing to_dict take precedence over plain __dict__ holders
    has_to_dict = hasattr(payload, "to_dict") and callable(payload.to_dict)
    if has_to_dict or hasattr(payload, "__dict__"):
        qualified_name = generate_qualified_class_name(type(payload))
        source = payload if has_to_dict else vars(payload)
        return {
            "serialization_schema": {"type": qualified_name},
            "serialized_data": _convert_to_basic_types(source),
        }

    # Primitives are stored unchanged next to their schema type name
    return {"serialization_schema": {"type": _primitive_schema_type(payload)}, "serialized_data": payload}
131

132

133
def _primitive_schema_type(value: Any) -> str:
    """
    Maps a primitive Python value to the name of its schema type.

    :param value: The value to classify.
    :returns: "null" for None, "boolean" for bool, "integer" for int, "number" for float and
        "string" for str as well as for any value of another type.
    """
    if value is None:
        return "null"

    # bool has to be tested before int because bool is a subclass of int
    for python_type, schema_name in ((bool, "boolean"), (int, "integer"), (float, "number")):
        if isinstance(value, python_type):
            return schema_name

    # str and every unrecognised type share the same name
    return "string"
148

149

150
def _convert_to_basic_types(value: Any) -> Any:
    """
    Recursively reduces a value to plain dicts, lists and leaf values.

    The checks run in this order:
    - Objects with a callable `to_dict` attribute: the result of `to_dict()` is converted recursively.
    - Objects with a `__dict__` attribute: their attributes become a plain dictionary.
    - Dictionaries: keys are kept, values are converted recursively.
    - Lists, tuples, and sets: elements are converted recursively and returned as a list.
    - Any other value is returned as-is.

    :param value: The value to convert.
    :returns: The converted value.
    """
    # Haystack-style objects describe themselves through to_dict
    to_dict = getattr(value, "to_dict", None)
    if callable(to_dict):
        return _convert_to_basic_types(to_dict())

    # Arbitrary objects expose their attributes through __dict__; real dicts are used directly
    mapping = None
    if hasattr(value, "__dict__"):
        mapping = vars(value)
    elif isinstance(value, dict):
        mapping = value
    if mapping is not None:
        return {name: _convert_to_basic_types(item) for name, item in mapping.items()}

    # Every supported collection type ends up as a list
    if isinstance(value, (list, tuple, set)):
        return [_convert_to_basic_types(element) for element in value]

    return value
184

185

186
def _deserialize_value_with_schema(serialized: Dict[str, Any]) -> Any:  # pylint: disable=too-many-return-statements, # noqa: PLR0911, PLR0912
    """
    Deserializes a value with schema information back to its original form.

    Takes a dict of the form:
      {
         "serialization_schema": {"type": "integer"} or {"type": "object", "properties": {...}},
         "serialized_data": <the actual data>
      }

    Arrays are rebuilt as a set when the schema has `uniqueItems` set to True, as a tuple when both
    `minItems` and `maxItems` are present, and as a list otherwise. This applies to every item type,
    including nested objects and arrays.

    :param serialized: The serialized dict with schema and data.
    :returns: The deserialized value in its original form.
    :raises DeserializationError:
        If `serialized` is empty or lacks one of the two expected keys, or if the schema has no `type`.
    """

    if not serialized or "serialization_schema" not in serialized or "serialized_data" not in serialized:
        raise DeserializationError(
            f"Invalid format of passed serialized payload. Expected a dictionary with keys "
            f"'serialization_schema' and 'serialized_data'. Got: {serialized}"
        )
    schema = serialized["serialization_schema"]
    data = serialized["serialized_data"]

    schema_type = schema.get("type")

    if not schema_type:
        # A schema without a 'type' is rejected; the message points at the pre-2.15.0 format as the likely cause
        raise DeserializationError(
            "Missing 'type' key in 'serialization_schema'. This likely indicates that you're using a serialized "
            "State object created with a version of Haystack older than 2.15.0. "
            "Support for the old serialization format is removed in Haystack 2.16.0. "
            "Please upgrade to the new serialization format to ensure forward compatibility."
        )

    # Handle object case (dictionary with properties)
    if schema_type == "object":
        properties = schema.get("properties")
        if not properties:
            # Nothing describes the fields, so fall back to schema-less deserialization
            return _deserialize_value(data)

        result: Dict[str, Any] = {}
        if isinstance(data, dict):
            for field, raw_value in data.items():
                field_schema = properties.get(field)
                # Fields without a schema entry are left out of the result
                if field_schema:
                    result[field] = _deserialize_value_with_schema(
                        {"serialization_schema": field_schema, "serialized_data": raw_value}
                    )
        return result

    # Handle array case
    if schema_type == "array":
        item_schema = schema.get("items", {})
        item_type = item_schema.get("type", "any")

        if item_type == "any":
            # No item type was recorded (e.g. the collection was empty when serialized)
            items = [_deserialize_value(item) for item in data]
        else:
            # Primitives, nested objects/arrays and custom classes all go through the same recursion
            items = [
                _deserialize_value_with_schema({"serialization_schema": item_schema, "serialized_data": item})
                for item in data
            ]

        # Restore the original collection type, also when the items are nested objects or arrays
        if schema.get("uniqueItems") is True:
            return set(items)
        if schema.get("minItems") is not None and schema.get("maxItems") is not None:
            return tuple(items)
        return items

    # Handle primitive types
    if schema_type in ("null", "boolean", "integer", "number", "string"):
        return data

    # Handle custom class types
    return _deserialize_value({"type": schema_type, "data": data})
275

276

277
def _deserialize_value(value: Any) -> Any:  # pylint: disable=too-many-return-statements # noqa: PLR0911
    """
    Rebuilds a value from the envelope format {"type": T, "data": D}, recursing into containers.

    Four kinds of input are distinguished:
    - Typed envelopes (dicts holding both "type" and "data"): T selects how D is rebuilt.
    - Plain dicts: values are deserialized recursively.
    - Collections (list/tuple/set): elements are deserialized recursively, keeping the collection type.
    - Other values: returned as-is.

    :param value: The value to deserialize
    :returns: The deserialized value

    """
    # Collections: rebuild the same container type from the deserialized elements
    if isinstance(value, (list, tuple, set)):
        container = type(value)
        return container(_deserialize_value(element) for element in value)

    # Anything that is neither a collection nor a dict is already in its final form
    if not isinstance(value, dict):
        return value

    # Plain dict (no envelope): recurse into the values
    if "type" not in value or "data" not in value:
        return {key: _deserialize_value(item) for key, item in value.items()}

    # Envelope: the type name decides how the data is interpreted
    type_name = value["type"]
    inner = value["data"]

    if type_name == "array":
        return [_deserialize_value(element) for element in inner]

    if type_name == "object":
        return {key: _deserialize_value(item) for key, item in inner.items()}

    if type_name in ("null", "boolean", "integer", "number", "string"):
        return inner

    # Custom class: import it by its qualified name, then rebuild it from the deserialized fields
    # NOTE(review): the class to import is named by the payload itself - confirm callers only pass trusted data
    target_class = import_class_by_name(type_name)
    fields = {key: _deserialize_value(item) for key, item in inner.items()}

    if callable(getattr(target_class, "from_dict", None)):
        return target_class.from_dict(fields)

    # No from_dict available: create a blank instance without calling __init__ and set the attributes
    instance = target_class.__new__(target_class)
    for attr_name, attr_value in fields.items():
        setattr(instance, attr_name, attr_value)
    return instance
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc