• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

nci / scores / 13171996478

06 Feb 2025 04:44AM UTC coverage: 99.962% (-0.04%) from 100.0%
13171996478

Pull #418

github

nicholasloveday
improve speed of bootstrap tests
Pull Request #418: (under development) 114 add circular block bootstrapping

357 of 357 branches covered (100.0%)

Branch coverage included in aggregate %.

123 of 124 new or added lines in 3 files covered. (99.19%)

2245 of 2246 relevant lines covered (99.96%)

4.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.33
/src/scores/processing/block_bootstrap.py
1
"""
2
Functions for performing block bootstrapping of arrays. This is inspired from
3
https://github.com/dougiesquire/xbootstrap with modifications to make the functions more
4
testable and also consistent with the scores package.
5
"""
6

7
import math
4✔
8
import os
4✔
9
from collections import OrderedDict
4✔
10
from itertools import chain, cycle, islice
4✔
11
from typing import Dict, List, Tuple, Union
4✔
12

13
import numpy as np
4✔
14
import xarray as xr
4✔
15

16
from scores.typing import XarrayLike
4✔
17

18
MAX_CHUNK_SIZE_MB = 200
4✔
19

20

21
def _get_blocked_random_indices(
4✔
22
    shape: list[int], block_axis: int, block_size: int, prev_block_sizes: list[int], circular: bool = True
23
) -> np.ndarray:
24
    """
25
    Return indices to randomly sample an axis of an array in consecutive
26
    (cyclic) blocks.
27

28
    Args:
29
        shape: The shape of the array to sample
30
        block_axis: The axis along which to sample blocks
31
        block_size: The size of each block to sample
32
        prev_block_sizes: Sizes of previous blocks along other axes
33
        circular: whether to sample block circularly.
34

35
    Returns:
36
        An array of indices to use for block resampling.
37
    """
38

39
    def _random_blocks(length, block, circular):
4✔
40
        """
41
        Indices to randomly sample blocks in a along an axis of a specified
42
        length
43
        """
44
        if block == length:
4✔
45
            return list(range(length))
4✔
46
        repeats = math.ceil(length / block)
4✔
47
        if circular:
4✔
48
            indices = list(
4✔
49
                chain.from_iterable(
50
                    islice(cycle(range(length)), s, s + block) for s in np.random.randint(0, length, repeats)
51
                )
52
            )
53
        else:
54
            indices = list(
4✔
55
                chain.from_iterable(
56
                    islice(range(length), s, s + block) for s in np.random.randint(0, length - block + 1, repeats)
57
                )
58
            )
59
        return indices[:length]
4✔
60

61
    # Don't randomise within an outer block
62
    if len(prev_block_sizes) > 0:
4✔
63
        orig_shape = shape.copy()
4✔
64
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
65
            prev_ax = block_axis - (i + 1)
4✔
66
            shape[prev_ax] = math.ceil(shape[prev_ax] / b)
4✔
67

68
    if block_size == 1:
4✔
69
        indices = np.random.randint(
4✔
70
            0,
71
            shape[block_axis],
72
            shape,
73
        )
74
    else:
75
        non_block_shapes = [s for i, s in enumerate(shape) if i != block_axis]
4✔
76
        indices = np.moveaxis(
4✔
77
            np.stack(
78
                [_random_blocks(shape[block_axis], block_size, circular) for _ in range(np.prod(non_block_shapes))],
79
                axis=-1,
80
            ).reshape([shape[block_axis]] + non_block_shapes),
81
            0,
82
            block_axis,
83
        )
84

85
    if len(prev_block_sizes) > 0:
4✔
86
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
87
            prev_ax = block_axis - (i + 1)
4✔
88
            indices = np.repeat(indices, b, axis=prev_ax).take(range(orig_shape[prev_ax]), axis=prev_ax)
4✔
89
        return indices
4✔
90
    return indices
4✔
91

92

93
def _n_nested_blocked_random_indices(
    sizes: OrderedDict[str, Tuple[int, int]], n_iteration: int, circular: bool = True
) -> OrderedDict[str, np.ndarray]:
    """
    Return indices for repeated nested block resampling (with replacement).

    "Nested" resampling means the first dimension is randomly resampled, then
    for each randomly sampled element along that dimension the second
    dimension is resampled, then for each sampled element along that
    dimension the third dimension is resampled, and so on.

    Args:
        sizes: Dictionary with {name: (size, block)} of the dimensions to resample
        n_iteration: The number of times to repeat the random resampling
        circular: Whether or not to do circular resampling.

    Returns:
        A dictionary of arrays containing indices for nested block resampling.
    """

    dim_lengths = [size for size, _ in sizes.values()]
    result: OrderedDict[str, np.ndarray] = OrderedDict()
    blocks_so_far: List[int] = []
    for axis, (name, (_, block_len)) in enumerate(sizes.items()):
        # Each dimension's indices span all previously sampled dims plus the
        # iteration axis appended last
        sample_shape = dim_lengths[: axis + 1] + [n_iteration]
        result[name] = _get_blocked_random_indices(sample_shape, axis, block_len, blocks_so_far, circular)
        blocks_so_far.append(block_len)
    return result
121

122

123
def _expand_n_nested_random_indices(indices: list[np.ndarray]) -> Tuple[np.ndarray, ...]:
4✔
124
    """
125
    Expand the dimensions of the nested input arrays so that they can be
126
    broadcast and return a tuple that can be directly indexed
127

128
    Args:
129
    indices:List of numpy arrays of sequentially increasing dimension as output by
130
        the function ``_n_nested_blocked_random_indices``. The last axis on all
131
        inputs is assumed to correspond to the iteration axis
132

133
    Returns:
134
        Expanded indices suitable for broadcasting.
135
    """
136
    broadcast_ndim = indices[-1].ndim
4✔
137
    broadcast_indices = []
4✔
138
    for i, ind in enumerate(indices):
4✔
139
        expand_axes = list(range(i + 1, broadcast_ndim - 1))
4✔
140
        broadcast_indices.append(np.expand_dims(ind, axis=expand_axes))
4✔
141
    return (..., *tuple(broadcast_indices))
4✔
142

143

144
def _bootstrap(*arrays: np.ndarray, indices: List[np.ndarray]) -> Union[np.ndarray, Tuple[np.ndarray, ...]]:
4✔
145
    """
146
    Bootstrap the array(s) using the provided indices
147

148
    Args:
149
        arrays: list of arrays to bootstrap
150
        indices: list of arrays containing indices to use for bootstrapping each input array
151

152
    Returns:
153
        Bootstrapped arrays
154
    """
155
    bootstrapped = [array[ind] for array, ind in zip(arrays, indices)]
4✔
156
    if len(bootstrapped) == 1:
4✔
157
        return bootstrapped[0]
4✔
158
    return tuple(bootstrapped)
4✔
159

160

161
def _block_bootstrap(  # pylint: disable=too-many-locals
    array_list: List[XarrayLike],
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Tuple[xr.DataArray, ...]:
    """
    Repeatedly performs bootstrapping on provided arrays across specified dimensions, stacking
    the new arrays along a new "iteration" dimension. Bootstrapping is executed in a nested
    manner: the first provided dimension is bootstrapped, then for each bootstrapped sample
    along that dimension, the second provided dimension is bootstrapped, and so forth.

    Args:
        array_list: Data to bootstrap. Multiple arrays can be passed to be bootstrapped
            in the same way. All input arrays must have nested dimensions.
        blocks: Dictionary of dimension(s) to bootstrap and the block sizes to use
            along each dimension: ``{dim: blocksize}``. Nesting is based on the order of
            this dictionary.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        Tuple of bootstrapped xarray DataArrays or Datasets, based on the input.

    Raises:
        ValueError: if ``exclude_dims`` is not a list of lists, if its length does not match
            ``array_list``, if a block dimension has inconsistent sizes across arrays, or if
            no input array contains all dimensions in ``blocks.keys()``.

    Note:
        This function expands out the iteration dimension inside a universal function.
        However, this may generate very large chunks (multiplying chunk size by the number
        of iterations), causing issues for larger iterations. It's advisable to apply this
        function in blocks using 'block_bootstrap'.

    References:
    Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
      Academic press, 2011.
    """
    # Rename exclude_dims so they are not bootstrapped
    if exclude_dims is None:
        exclude_dims = [[] for _ in range(len(array_list))]
    if not isinstance(exclude_dims, list) or not all(isinstance(x, list) for x in exclude_dims):
        raise ValueError("exclude_dims should be a list of lists")
    if len(exclude_dims) != len(array_list):
        raise ValueError(
            "exclude_dims should be a list of the same length as the number of "
            "arrays containing lists of dimensions to exclude for each array"
        )

    # Work on a shallow copy so the caller's list is never modified in place.
    # This function may be called repeatedly with the same list (see
    # ``block_bootstrap``); renaming into the caller's list would break
    # subsequent calls when ``exclude_dims`` is used.
    array_list = list(array_list)

    renames = []
    for i, (obj, exclude) in enumerate(zip(array_list, exclude_dims)):
        array_list[i] = obj.rename(
            {d: f"dim{ii}" for ii, d in enumerate(exclude)},
        )
        renames.append({f"dim{ii}": d for ii, d in enumerate(exclude)})
    dim = list(blocks.keys())

    # Ensure bootstrapped dimensions have consistent sizes across array_list
    for d in blocks.keys():
        dim_sizes = [o.sizes[d] for o in array_list if d in o.dims]
        if not all(s == dim_sizes[0] for s in dim_sizes):
            raise ValueError(f"Block dimension {d} is not the same size on all input arrays")

    # Get the sizes of the bootstrap dimensions from the first array that has
    # all of them
    sizes = None
    for obj in array_list:
        try:
            sizes = OrderedDict({d: (obj.sizes[d], b) for d, b in blocks.items()})
            break
        except KeyError:
            pass
    if sizes is None:
        raise ValueError(
            "At least one input array must contain all dimensions in blocks.keys()",
        )

    # Generate random indices for bootstrapping all arrays in array_list
    nested_indices = _n_nested_blocked_random_indices(sizes, n_iteration, circular)

    # Expand indices for broadcasting for each array separately
    indices = []
    input_core_dims = []
    for obj in array_list:
        available_dims = [d for d in dim if d in obj.dims]
        indices_to_expand = [nested_indices[key] for key in available_dims]

        indices.append(_expand_n_nested_random_indices(indices_to_expand))
        input_core_dims.append(available_dims)

    # Process arrays separately to handle non-matching dimensions
    result = []
    for obj, ind, core_dims in zip(array_list, indices, input_core_dims):
        if isinstance(obj, xr.Dataset):
            # Assume all variables have the same dtype
            output_dtype = obj[list(obj.data_vars)[0]].dtype
        else:
            output_dtype = obj.dtype

        result.append(
            xr.apply_ufunc(
                _bootstrap,
                obj,
                kwargs={"indices": [ind]},
                input_core_dims=[core_dims],
                output_core_dims=[core_dims + ["iteration"]],
                dask="parallelized",
                dask_gufunc_kwargs={"output_sizes": {"iteration": n_iteration}},
                output_dtypes=[output_dtype],
            )
        )

    # Rename excluded dimensions back to their original names
    return tuple(res.rename(rename) for res, rename in zip(result, renames))
280

281

282
def block_bootstrap(
    array_list: List[XarrayLike] | XarrayLike,
    *,  # Enforce keyword-only arguments
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Union[XarrayLike, Tuple[XarrayLike, ...]]:
    """
    Perform block bootstrapping on provided arrays. The function creates new arrays by repeatedly
    bootstrapping along specified dimensions and stacking the new arrays along a new "iteration"
    dimension. Additionally, it includes internal functions for chunk size calculation and
    handling Dask arrays for chunk size limitation.

    Args:
        array_list: The data to bootstrap, which can be a single xarray object or
            a list of multiple xarray objects. In the case where
            multiple datasets are passed, each dataset can have its own set of dimensions. However,
            for successful bootstrapping, dimensions across all input arrays must be nested.
            For instance, for ``blocks.keys=['d1', 'd2', 'd3']``, an array with dimensions 'd1' and
            'd2' is valid, but an array with only dimension 'd2' is not valid. All datasets
            are bootstrapped according to the same random samples along available dimensions.
        blocks: A dictionary specifying the dimension(s) to bootstrap and the block sizes to
            use along each dimension: ``{dimension: block_size}``. The keys represent the dimensions
            to be bootstrapped, and the values indicate the block sizes along each dimension.
            The dimension provided here should exist in the data provided in ``array_list``.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        If a single Dataset/DataArray (XarrayLike) is provided, the function returns a
        bootstrapped XarrayLike object along the "iteration" dimension. If multiple XarrayLike
        objects are provided, it returns a tuple of bootstrapped XarrayLike objects, each stacked
        along the "iteration" dimension.

    References:
        - Gilleland, E. (2020). Bootstrap Methods for Statistical Inference. Part I:
            Comparative Forecast Verification for Continuous Variables. Journal of
            Atmospheric and Oceanic Technology, 37(11), 2117–2134. https://doi.org/10.1175/jtech-d-20-0069.1
        - Wilks, D. S. (2011). Statistical methods in the atmospheric sciences. Academic press.
            https://doi.org/10.1016/C2017-0-03921-6
    """

    # While the most efficient method involves expanding the iteration dimension within the
    # universal function, this approach might generate excessively large chunks (resulting
    # from multiplying chunk size by iterations) leading to issues with large numbers of
    # iterations. Hence, this function loops over blocks of iterations to generate the total
    # number of iterations.
    def _max_chunk_size_mb(ds):
        """
        Get the max chunk size (in MB) across all variables in a dataset
        """
        ds = ds if isinstance(ds, xr.Dataset) else ds.to_dataset(name="ds")

        chunks = []
        for var in ds.data_vars:
            da = ds[var]
            chunk = da.chunks
            itemsize = da.data.itemsize
            # The largest chunk along each dimension bounds the biggest single chunk
            size_of_chunk = itemsize * np.prod([np.max(x) for x in chunk]) / (1024**2)
            chunks.append(size_of_chunk)
        return max(chunks)

    # Use the builtin ``list`` for the runtime check (isinstance against
    # typing.List is deprecated)
    if not isinstance(array_list, list):
        array_list = [array_list]
    # Choose iteration blocks to limit chunk size on dask arrays
    if array_list[0].chunks:  # Note: This is a way to check if the array is backed by a dask.array
        # without loading data into memory.
        # See https://docs.xarray.dev/en/stable/generated/xarray.DataArray.chunks.html
        ds_max_chunk_size_mb = max(_max_chunk_size_mb(obj) for obj in array_list)
        blocksize = int(MAX_CHUNK_SIZE_MB / ds_max_chunk_size_mb)
        blocksize = min(blocksize, n_iteration)
        blocksize = max(blocksize, 1)
    else:
        blocksize = n_iteration

    bootstraps = []
    # Full-size iteration chunks
    for _ in range(blocksize, n_iteration + 1, blocksize):
        bootstraps.append(
            _block_bootstrap(
                # Pass a fresh shallow copy each call so any in-place
                # modification of the list by the callee cannot leak across
                # iteration chunks
                list(array_list),
                blocks=blocks,
                n_iteration=blocksize,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )
    # Remaining iterations that did not fill a whole chunk
    leftover = n_iteration % blocksize

    if leftover:
        bootstraps.append(
            _block_bootstrap(
                list(array_list),
                blocks=blocks,
                n_iteration=leftover,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )

    # Concatenate the per-chunk results along the iteration dimension
    bootstraps_concat = tuple(
        xr.concat(
            bootstrap,
            dim="iteration",
            coords="minimal",
            compat="override",
        )
        for bootstrap in zip(*bootstraps)
    )

    if len(array_list) == 1:
        return bootstraps_concat[0]
    return bootstraps_concat
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc