• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NaturalHistoryMuseum / splitgill / #89

07 Aug 2024 05:01PM UTC coverage: 94.805%. Remained the same
#89

push

coveralls-python

jrdh
docs: update docs significantly

1022 of 1078 relevant lines covered (94.81%)

0.95 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.23
/splitgill/diffing.py
1
import abc
1✔
2
from collections import deque
1✔
3
from dataclasses import dataclass
1✔
4
from datetime import datetime, date
1✔
5
from itertools import zip_longest
1✔
6
from typing import (
1✔
7
    Iterable,
8
    Tuple,
9
    Any,
10
    Union,
11
    NamedTuple,
12
    Dict,
13
    Deque,
14
    List,
15
    TypeVar,
16
    Generic,
17
    Optional,
18
    Collection,
19
)
20

21
import regex as rx
1✔
22

23
# strftime formats used to turn datetime and date objects into strings before data
24
# enters MongoDB (see prepare_data), these are based on ISO 8601
25
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f%z"
1✔
26
DATE_FORMAT = "%Y-%m-%d"
1✔
27
# when we turn a naive datetime into a string using the DATETIME_FORMAT above, %z won't
28
# appear meaning we can't strptime with the same format. This is annoying, so here's a
29
# strptime format that can parse this native result
30
NAIVE_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
1✔
31

32
# this regex matches invalid characters which we would like to remove from all string
33
# values as they are ingested into the system. It matches unicode control characters
34
# (i.e. category C*) but not \n, \r, or \t).
35
invalid_value_char_regex = rx.compile(r"[^\P{C}\n\r\t]")
1✔
36
# this regex matches invalid characters which we would like to remove from all field
37
# names as they are ingested into the system. It matches all unicode control characters,
38
# so it's a bit stricter than the value regex which allows new lines and tabs
39
invalid_key_char_regex = rx.compile(r"[^\P{C}]")
1✔
40

41

42
def prepare_data(
1✔
43
    value: Any,
44
) -> Union[str, dict, list, int, float, bool, datetime, date, None]:
45
    """
46
    Prepares the given value for storage in MongoDB. Conversions are completed like so:
47

48
        - None values are just returned as is
49
        - str values have invalid characters removed and are then returned. The
50
          characters are currently all unicode control characters except \n, \r, and \t.
51
        - int, float, bool, and None values are returned with no changes made
52
        - datetime and date values are converted to strings using strftime with the
53
          specific formats DATETIME_FORMAT and DATE_FORMAT.
54
        - dict values are returned as a new dict instance, with all the keys converted
55
          to strings and all the values recursively prepared using this function.
56
        - lists, sets, and tuples are converted to lists with each element of the value
57
          prepared by this function.
58

59
    :param value: the value to be stored in MongoDB
60
    :return: None, str, int, float, bool, tuple, or dict depending on the input value
61
    """
62
    if value is None:
1✔
63
        return None
1✔
64

65
    if isinstance(value, str):
1✔
66
        # replace any invalid characters in the string with the empty string
67
        return invalid_value_char_regex.sub("", value)
1✔
68

69
    if isinstance(value, (int, float, bool)):
1✔
70
        return value
1✔
71

72
    if isinstance(value, dict):
1✔
73
        return {
1✔
74
            prepare_field_name(key): prepare_data(value) for key, value in value.items()
75
        }
76

77
    if isinstance(value, (list, set, tuple)):
1✔
78
        return list(map(prepare_data, value))
1✔
79

80
    # check datetime first as datetime is a subclass of date
81
    if isinstance(value, datetime):
1✔
82
        # stringifying this ensures the tz info is recorded and won't change going
83
        # in/out mongo
84
        return value.strftime(DATETIME_FORMAT)
1✔
85

86
    # now check date as we've covered off datetimes
87
    if isinstance(value, date):
1✔
88
        # stringify to simplify handling of dates to mirror datetime pattern even though
89
        # the same timezone issue doesn't exist here
90
        return value.strftime(DATE_FORMAT)
1✔
91

92
    # fallback
93
    return str(value)
1✔
94

95

96
def prepare_field_name(name: Any) -> str:
1✔
97
    """
98
    Cleans up a field name for ingestion into the system. There are a few steps to this:
99

100
        - convert the name to a str, MongoDB only accepts str keys in objects
101
        - remove any control characters from the str
102
        - replace . with _ as Elasticsearch doesn't like dots in keys
103

104
    :param name: the field name
105
    :return: a clean str field name
106
    """
107
    return invalid_key_char_regex.sub("", str(name)).replace(".", "_").strip()
1✔
108

109

110
class DiffOp(NamedTuple):
1✔
111
    """
112
    A namedtuple describing the differences found by the diff function.
113
    """
114

115
    # the path where the changes should be applied at in the root dict
116
    path: Tuple[Union[str, int], ...]
1✔
117
    # a dict of the changes made at that path
118
    ops: Dict[str, Any]
1✔
119

120

121
_T = TypeVar("_T")
1✔
122

123

124
@dataclass
1✔
125
class Comparison(abc.ABC, Generic[_T]):
1✔
126
    """
127
    A comparison between two objects of the same type.
128
    """
129

130
    path: Tuple[Union[str, int], ...]
1✔
131
    left: _T
1✔
132
    right: _T
1✔
133

134
    @abc.abstractmethod
1✔
135
    def compare(self) -> Tuple[Optional[DiffOp], List["Comparison"]]:
1✔
136
        """
137
        Compare the two objects and return a 2-tuple containing a DiffOp and a list of
138
        further comparisons which need to be handled. If no differences are found, then
139
        the first element of the returned 2-tuple will be None.
140

141
        :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
142
        """
143
        pass
×
144

145

146
class DictComparison(Comparison[dict]):
1✔
147
    """
148
    A comparison between two dicts.
149
    """
150

151
    def compare(self) -> Tuple[Optional[DiffOp], List["Comparison"]]:
1✔
152
        """
153
        Compares the two dicts and return a 2-tuple containing a DiffOp and a list of
154
        further comparisons which need to be handled. If no differences are found, then
155
        the first element of the returned 2-tuple will be None.
156

157
        :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
158
        """
159
        missing = object()
1✔
160
        ops = {}
1✔
161
        further_comparisons = []
1✔
162

163
        new_values = {
1✔
164
            key: value for key, value in self.right.items() if key not in self.left
165
        }
166
        if new_values:
1✔
167
            ops["dn"] = new_values
1✔
168

169
        deleted_keys = [key for key in self.left if key not in self.right]
1✔
170
        if deleted_keys:
1✔
171
            ops["dd"] = deleted_keys
1✔
172

173
        changes = {}
1✔
174
        for key, left_value in self.left.items():
1✔
175
            right_value = self.right.get(key, missing)
1✔
176

177
            # deletion or equality, nothing to do
178
            if right_value is missing or left_value == right_value:
1✔
179
                continue
1✔
180

181
            # check for nested container objects and add Comparison objects to the list
182
            # if any are found of the same types
183
            if isinstance(left_value, dict) and isinstance(right_value, dict):
1✔
184
                further_comparisons.append(
1✔
185
                    DictComparison((*self.path, key), left_value, right_value)
186
                )
187
            elif isinstance(left_value, list) and isinstance(right_value, list):
1✔
188
                further_comparisons.append(
1✔
189
                    ListComparison((*self.path, key), left_value, right_value)
190
                )
191
            else:
192
                changes[key] = right_value
1✔
193
        if changes:
1✔
194
            ops["dc"] = changes
1✔
195

196
        return DiffOp(self.path, ops) if ops else None, further_comparisons
1✔
197

198

199
@dataclass
1✔
200
class ListComparison(Comparison[list]):
1✔
201
    """
202
    A comparison between two lists.
203
    """
204

205
    def compare(self) -> Tuple[DiffOp, List["Comparison"]]:
1✔
206
        """
207
        Compares the two lists and return a 2-tuple containing a DiffOp and a list of
208
        further comparisons which need to be handled. If no differences are found, then
209
        the first element of the returned 2-tuple will be None.
210

211
        :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
212
        """
213
        missing = object()
1✔
214
        ops = {}
1✔
215
        further_comparisons = []
1✔
216

217
        changes = []
1✔
218
        for index, (left_value, right_value) in enumerate(
1✔
219
            zip_longest(self.left, self.right, fillvalue=missing)
220
        ):
221
            if left_value == right_value:
1✔
222
                continue
1✔
223

224
            if left_value is missing:
1✔
225
                # the right list is longer, so store all the new values so that they can
226
                # just be added to the left list to patch it, and stop
227
                ops["ln"] = self.right[index:]
1✔
228
                break
1✔
229
            elif right_value is missing:
1✔
230
                # the left value is longer, so store the index from which elements in
231
                # the left list will be deleted to shorten it to the length of the right
232
                # list, and stop
233
                ops["ld"] = index
1✔
234
                break
1✔
235
            else:
236
                # a change in the values at this index in each list, check for nested
237
                # container objects and add Comparison objects to the list if any are
238
                # found of the same types
239
                if isinstance(left_value, dict) and isinstance(right_value, dict):
1✔
240
                    further_comparisons.append(
1✔
241
                        DictComparison((*self.path, index), left_value, right_value)
242
                    )
243
                elif isinstance(left_value, list) and isinstance(right_value, list):
1✔
244
                    further_comparisons.append(
1✔
245
                        ListComparison((*self.path, index), left_value, right_value)
246
                    )
247
                else:
248
                    changes.append((index, right_value))
1✔
249
        if changes:
1✔
250
            ops["lc"] = changes
1✔
251

252
        return DiffOp(self.path, ops) if ops else None, further_comparisons
1✔
253

254

255
class DiffingTypeComparisonException(Exception):
1✔
256
    """
257
    Exception raised if the base type and the new type passed to the diff function below
258
    are not both dicts.
259
    """
260

261
    pass
1✔
262

263

264
def diff(base: dict, new: dict) -> Iterable[DiffOp]:
1✔
265
    """
266
    Finds the differences between the two dicts, yielding DiffOps. Each DiffOp describes
267
    specific differences between the base dict and the new dict. By applying them all
268
    using the patch function below, the new dict can be recreated from the base dict.
269

270
    For efficiency, the DiffOps represent all the changes at a container level (e.g. a
271
    dict or list) not each specific change to every version at a specific key or index.
272
    This saves not only database space, but also allows for a faster patch function as
273
    changes can be applied en masse instead of individually.
274

275
    :param base: the base dict
276
    :param new: the new version of the base dict
277
    :return: yields DiffOps (if any changes are found)
278
    """
279
    if base == new:
1✔
280
        return
1✔
281

282
    if not isinstance(base, dict) or not isinstance(new, dict):
1✔
283
        raise DiffingTypeComparisonException("Both base and new must be dicts")
1✔
284

285
    # todo: we could write a shortcut when one of the dicts is empty
286

287
    queue: Deque[Comparison] = deque([DictComparison(tuple(), base, new)])
1✔
288
    while queue:
1✔
289
        comparison: Comparison = queue.popleft()
1✔
290
        diff_op, further_comparisons = comparison.compare()
1✔
291
        if diff_op:
1✔
292
            yield diff_op
1✔
293
        if further_comparisons:
1✔
294
            queue.extend(further_comparisons)
1✔
295

296

297
# # dynamically figure out the typing of the DiffOp based on it's annotations
298
# _diff_op_typing = get_type_hints(DiffOp)
299
# _DiffOpType = Tuple[_diff_op_typing["path"], _diff_op_typing["ops"]]
300

301

302
def patch(base: dict, ops: Collection[DiffOp]) -> dict:
1✔
303
    """
304
    Applies the operations in the ops iterable to the base dict, returning a new dict.
305
    If there are no operations to apply, the base dict is returned unchanged.
306

307
    Note that although the returned dict is new, the nested container values in it may
308
    or may not be new and could reference the same exact object as in the passed base
309
    dict. A nested container will be copied to avoid modifying the same container
310
    referenced in the base dict if there are any modifications made to it directly, or
311
    to any nested containers below it at any depth. If the nested container contains no
312
    changes to itself or its nested containers, it is not copied and the original
313
    reference to it from the base dict is used.
314

315
    :param base: the starting dict
316
    :param ops: the DiffOps to apply to the base dict (can be pure tuples, doesn't have
317
                to be DiffOp namedtuples)
318
    :return: a new dict with the changes applied
319
    """
320
    # nothing to do
321
    if len(ops) == 0:
1✔
322
        return base
1✔
323

324
    # create a copy of the base dict so that we don't modify it and can return a new one
325
    new = base.copy()
1✔
326

327
    for path, op in ops:
1✔
328
        # loop through the path finding the target of the operations we're going to
329
        # perform. At every point in the path, replace the container with a copy to
330
        # ensure we don't modify the container object from the base.
331
        target = new
1✔
332
        for key_or_index in path:
1✔
333
            target_copy = target[key_or_index].copy()
1✔
334
            target[key_or_index] = target_copy
1✔
335
            target = target_copy
1✔
336

337
        # dict ops
338
        if "dc" in op:
1✔
339
            target.update(op["dc"])
1✔
340
        if "dn" in op:
1✔
341
            target.update(op["dn"])
1✔
342
        if "dd" in op:
1✔
343
            for key in op["dd"]:
1✔
344
                del target[key]
1✔
345

346
        # list ops
347
        if "lc" in op:
1✔
348
            for index, value in op["lc"]:
1✔
349
                target[index] = value
1✔
350
        if "ln" in op:
1✔
351
            target.extend(op["ln"])
1✔
352
        if "ld" in op:
1✔
353
            del target[op["ld"] :]
1✔
354

355
    return new
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc