• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

nci / scores / 16336054633

17 Jul 2025 04:13AM UTC coverage: 99.449% (-0.6%) from 100.0%
16336054633

Pull #856

github

tennlee
Allow tests to run for both dask and non-dask environments
Some pylint issues are reported, but I think these are mostly unrelated, further code refactoring may be needed to bring the code up to full health according to pylint
Pull Request #856: [WIP] Branch to track progress removing unexpected reliance on dask

390 of 391 branches covered (99.74%)

Branch coverage included in aggregate %.

2496 of 2511 relevant lines covered (99.4%)

3.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.32
/src/scores/processing/block_bootstrap_impl.py
1
"""
2
Functions for performing block bootstrapping of arrays. This is inspired from
3
https://github.com/dougiesquire/xbootstrap with modifications to make the functions more
4
testable and also consistent with the scores package.
5
"""
6

7
import math
4✔
8
import os
4✔
9
from collections import OrderedDict
4✔
10
from itertools import chain, cycle, islice
4✔
11
from typing import Dict, List, Tuple, Union
4✔
12

13
import numpy as np
4✔
14
import xarray as xr
4✔
15

16
from scores.typing import XarrayLike
4✔
17
from scores.utils import tmp_coord_name
4✔
18

19
# When Dask is being used, this constant helps control the sizes of batches
20
# when bootstrapping
21
MAX_BATCH_SIZE_MB = 200
4✔
22

23

24
def _get_blocked_random_indices(
4✔
25
    shape: list[int], block_axis: int, block_size: int, prev_block_sizes: list[int], circular: bool = True
26
) -> np.ndarray:
27
    """
28
    Return indices to randomly sample an axis of an array in consecutive
29
    (cyclic) blocks.
30

31
    Args:
32
        shape: The shape of the array to sample
33
        block_axis: The axis along which to sample blocks
34
        block_size: The size of each block to sample
35
        prev_block_sizes: Sizes of previous blocks along other axes
36
        circular: whether to sample block circularly.
37

38
    Returns:
39
        An array of indices to use for block resampling.
40
    """
41

42
    def _random_blocks(length, block, circular):
4✔
43
        """
44
        Indices to randomly sample blocks in a along an axis of a specified
45
        length
46
        """
47
        if block == length:
4✔
48
            return list(range(length))
4✔
49
        repeats = math.ceil(length / block)
4✔
50
        if circular:
4✔
51
            indices = list(
4✔
52
                chain.from_iterable(
53
                    islice(cycle(range(length)), s, s + block) for s in np.random.randint(0, length, repeats)
54
                )
55
            )
56
        else:
57
            indices = list(
4✔
58
                chain.from_iterable(
59
                    islice(range(length), s, s + block) for s in np.random.randint(0, length - block + 1, repeats)
60
                )
61
            )
62
        return indices[:length]
4✔
63

64
    # Don't randomise within an outer block
65
    if len(prev_block_sizes) > 0:
4✔
66
        orig_shape = shape.copy()
4✔
67
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
68
            prev_ax = block_axis - (i + 1)
4✔
69
            shape[prev_ax] = math.ceil(shape[prev_ax] / b)
4✔
70

71
    if block_size == 1:
4✔
72
        indices = np.random.randint(
4✔
73
            0,
74
            shape[block_axis],
75
            shape,
76
        )
77
    else:
78
        non_block_shapes = [s for i, s in enumerate(shape) if i != block_axis]
4✔
79
        indices = np.moveaxis(
4✔
80
            np.stack(
81
                [_random_blocks(shape[block_axis], block_size, circular) for _ in range(np.prod(non_block_shapes))],
82
                axis=-1,
83
            ).reshape([shape[block_axis]] + non_block_shapes),
84
            0,
85
            block_axis,
86
        )
87

88
    if len(prev_block_sizes) > 0:
4✔
89
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
90
            prev_ax = block_axis - (i + 1)
4✔
91
            indices = np.repeat(indices, b, axis=prev_ax).take(range(orig_shape[prev_ax]), axis=prev_ax)
4✔
92
        return indices
4✔
93
    return indices
4✔
94

95

96
def _n_nested_blocked_random_indices(
    sizes: OrderedDict[str, Tuple[int, int]], n_iteration: int, circular: bool = True
) -> OrderedDict[str, np.ndarray]:
    """
    Return indices to randomly resample blocks of an array (with replacement)
    in a nested manner many times. "Nested" resampling means the first
    dimension is randomly resampled, then for each randomly sampled element
    along that dimension the second dimension is resampled, then for each of
    those the third dimension is resampled, and so on.

    Args:
        sizes: Dictionary with {name: (size, block)} of the dimensions to resample.
        n_iteration: The number of times to repeat the random resampling.
        circular: Whether or not to do circular resampling.

    Returns:
        A dictionary of arrays containing indices for nested block resampling.
    """
    axis_lengths = [size for size, _ in sizes.values()]
    resample_indices: OrderedDict[str, np.ndarray] = OrderedDict()
    earlier_blocks: List[int] = []
    for axis, (name, (_, block_len)) in enumerate(sizes.items()):
        # Each successive dimension gains one axis, plus the iteration axis
        sample_shape = axis_lengths[: axis + 1] + [n_iteration]
        resample_indices[name] = _get_blocked_random_indices(
            sample_shape, axis, block_len, earlier_blocks, circular
        )
        earlier_blocks.append(block_len)
    return resample_indices
124

125

126
def _expand_n_nested_random_indices(indices: list[np.ndarray]) -> Tuple[np.ndarray, ...]:
4✔
127
    """
128
    Expand the dimensions of the nested input arrays so that they can be
129
    broadcast and return a tuple that can be directly indexed
130

131
    Args:
132
    indices:List of numpy arrays of sequentially increasing dimension as output by
133
        the function ``_n_nested_blocked_random_indices``. The last axis on all
134
        inputs is assumed to correspond to the iteration axis
135

136
    Returns:
137
        Expanded indices suitable for broadcasting.
138
    """
139
    broadcast_ndim = indices[-1].ndim
4✔
140
    broadcast_indices = []
4✔
141
    for i, ind in enumerate(indices):
4✔
142
        expand_axes = list(range(i + 1, broadcast_ndim - 1))
4✔
143
        broadcast_indices.append(np.expand_dims(ind, axis=expand_axes))
4✔
144
    return (..., *tuple(broadcast_indices))
4✔
145

146

147
def _bootstrap(*arrays: np.ndarray, indices: List[np.ndarray]) -> Union[np.ndarray, Tuple[np.ndarray, ...]]:
4✔
148
    """
149
    Bootstrap the array(s) using the provided indices
150

151
    Args:
152
        arrays: list of arrays to bootstrap
153
        indices: list of arrays containing indices to use for bootstrapping each input array
154

155
    Returns:
156
        Bootstrapped arrays
157
    """
158
    bootstrapped = [array[ind] for array, ind in zip(arrays, indices)]
4✔
159
    if len(bootstrapped) == 1:
4✔
160
        return bootstrapped[0]
4✔
161
    return tuple(bootstrapped)
4✔
162

163

164
def _block_bootstrap(  # pylint: disable=too-many-locals
    array_list: List[XarrayLike],
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Tuple[xr.DataArray, ...]:
    """
    Repeatedly performs bootstrapping on provided arrays across specified dimensions, stacking
    the new arrays along a new "iteration" dimension. Bootstrapping is executed in a nested
    manner: the first provided dimension is bootstrapped, then for each bootstrapped sample
    along that dimension, the second provided dimension is bootstrapped, and so forth.

    Args:
        array_list: Data to bootstrap. Multiple arrays can be passed to be bootstrapped
            in the same way. All input arrays must have nested dimensions.
        blocks: Dictionary of dimension(s) to bootstrap and the block sizes to use
            along each dimension: ``{dim: blocksize}``. Nesting is based on the order of
            this dictionary.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        Tuple of bootstrapped xarray DataArrays or Datasets, based on the input.

    Raises:
        ValueError: If ``exclude_dims`` is not a list of lists, or not the same length as
            ``array_list``; if a block dimension has inconsistent sizes across arrays; or
            if no input array contains all dimensions in ``blocks.keys()``.

    Note:
        This function expands out the iteration dimension inside a universal function.
        However, this may generate very large chunks (multiplying chunk size by the number
        of iterations), causing issues for larger iterations. It's advisable to apply this
        function in blocks using 'block_bootstrap'.

    References:
        Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
        Academic press, 2011.
    """
    if exclude_dims is None:
        exclude_dims = [[] for _ in range(len(array_list))]
    if not isinstance(exclude_dims, list) or not all(isinstance(x, list) for x in exclude_dims):
        raise ValueError("exclude_dims should be a list of lists")
    if len(exclude_dims) != len(array_list):
        raise ValueError(
            "exclude_dims should be a list of the same length as the number of arrays in array_list",
        )

    # Work on a shallow copy so the in-place renaming below does not leak into
    # the caller's list. ``block_bootstrap`` calls this function repeatedly with
    # the same list; without the copy, the excluded dimensions would keep their
    # temporary names after the first call and later calls would fail.
    array_list = list(array_list)

    # Rename exclude_dims to temporary names so they are not bootstrapped
    renames = []
    for i, (obj, exclude) in enumerate(zip(array_list, exclude_dims)):
        new_dim_list = tmp_coord_name(obj, count=len(exclude))
        if isinstance(new_dim_list, str):
            new_dim_list = [new_dim_list]
        rename_dict = {d: f"{new_dim_list[ii]}" for ii, d in enumerate(exclude)}
        array_list[i] = obj.rename(rename_dict)
        # Remember the inverse mapping so names can be restored at the end
        renames.append({v: k for k, v in rename_dict.items()})

    dim = list(blocks.keys())

    # Ensure bootstrapped dimensions have consistent sizes across array_list
    for d in blocks.keys():
        dim_sizes = [o.sizes[d] for o in array_list if d in o.dims]
        if not all(s == dim_sizes[0] for s in dim_sizes):
            raise ValueError(f"Block dimension {d} is not the same size on all input arrays")

    # Get the sizes of the bootstrap dimensions from the first array that
    # contains all of them
    sizes = None
    for obj in array_list:
        try:
            sizes = OrderedDict({d: (obj.sizes[d], b) for d, b in blocks.items()})
            break
        except KeyError:
            pass
    if sizes is None:
        raise ValueError(
            "At least one input array must contain all dimensions in blocks.keys()",
        )

    # Generate random indices for bootstrapping all arrays the same way
    nested_indices = _n_nested_blocked_random_indices(sizes, n_iteration, circular)

    # Expand indices for broadcasting for each array separately, since each
    # array may contain only a leading subset of the block dimensions
    indices = []
    input_core_dims = []
    for obj in array_list:
        available_dims = [d for d in dim if d in obj.dims]
        indices_to_expand = [nested_indices[key] for key in available_dims]

        indices.append(_expand_n_nested_random_indices(indices_to_expand))
        input_core_dims.append(available_dims)

    # Process arrays separately to handle non-matching dimensions
    result = []
    for obj, ind, core_dims in zip(array_list, indices, input_core_dims):
        if isinstance(obj, xr.Dataset):
            # Assume all variables have the same dtype
            output_dtype = obj[list(obj.data_vars)[0]].dtype
        else:
            output_dtype = obj.dtype

        result.append(
            xr.apply_ufunc(
                _bootstrap,
                obj,
                kwargs={"indices": [ind]},
                input_core_dims=[core_dims],
                output_core_dims=[core_dims + ["iteration"]],
                dask="parallelized",
                dask_gufunc_kwargs={"output_sizes": {"iteration": n_iteration}},
                output_dtypes=[output_dtype],
            )
        )

    # Restore the original names of the excluded dimensions
    return tuple(res.rename(rename) for res, rename in zip(result, renames))
284

285

286
def block_bootstrap(
    array_list: List[XarrayLike] | XarrayLike,
    *,  # Enforce keyword-only arguments
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Union[XarrayLike, Tuple[XarrayLike, ...]]:
    """
    Perform block bootstrapping on provided arrays. The function creates new arrays by repeatedly
    bootstrapping along specified dimensions and stacking the new arrays along a new "iteration"
    dimension. Additionally, it includes internal functions for chunk size calculation and
    handling Dask arrays for chunk size limitation.

    Args:
        array_list: The data to bootstrap, which can be a single xarray object or
            a list of multiple xarray objects. In the case where
            multiple datasets are passed, each dataset can have its own set of dimension. However,
            for successful bootstrapping, dimensions across all input arrays must be nested.
            For instance, for ``block.keys=['d1', 'd2', 'd3']``, an array with dimension 'd1' and
            'd2' is valid, but an array with only dimension 'd2' is not valid. All datasets
            are bootstrapped according to the same random samples along available dimensions.
        blocks: A dictionary specifying the dimension(s) to bootstrap and the block sizes to
            use along each dimension: ``{dimension: block_size}``. The keys represent the dimensions
            to be bootstrapped, and the values indicate the block sizes along each dimension.
            The dimension provided here should exist in the data provided in ``array_list``.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        If a single Dataset/DataArray (XarrayLike) is provided, the functions returns a
        bootstrapped XarrayLike object along the "iteration" dimension. If multiple XarrayLike
        objects are provided, it returns a tuple of bootstrapped XarrayLike objects, each stacked
        along the "iteration" dimension.

    Raises:
        ValueError: If bootstrapped dimensions don't have consistent sizes across ``array_list``.
        ValueError: If there is not at least one input array that contains all dimensions in blocks.keys().
        ValueError: If ``exclude_dims`` is not a list of lists.
        ValueError: If the list ``exclude_dims`` is not the same length as ``array_list``.

    References:
        - Gilleland, E. (2020). Bootstrap Methods for Statistical Inference. Part I:
          Comparative Forecast Verification for Continuous Variables. Journal of
          Atmospheric and Oceanic Technology, 37(11), 2117–2134. https://doi.org/10.1175/jtech-d-20-0069.1
        - Wilks, D. S. (2011). Statistical methods in the atmospheric sciences. Academic press.
          https://doi.org/10.1016/C2017-0-03921-6

    Examples:
        Bootstrap a fcst and obs dataset along the time and space dimensions with block sizes of 10
        for each dimension. The bootstrapping is repeated 1000 times.

        >>> import numpy as np
        >>> import xarray as xr
        >>> from scores.processing import block_bootstrap
        >>> obs = xr.DataArray(np.random.rand(100, 100), dims=["time", "space"])
        >>> fcst = xr.DataArray(np.random.rand(100, 100), dims=["time", "space"])
        >>> bootstrapped_obs, bootstrapped_fcst = block_bootstrap(
        ...     [obs, fcst],
        ...     blocks={"time": 10, "space": 10},
        ...     n_iteration=1000,
        ... )
    """

    # While the most efficient method involves expanding the iteration dimension within the
    # universal function, this approach might generate excessively large chunks (resulting
    # from multiplying chunk size by iterations) leading to issues with large numbers of
    # iterations. Hence, this function loops over blocks of iterations to generate the total
    # number of iterations.
    def _max_chunk_size_mb(ds):
        """
        Get the max chunk size (in MB) across the variables of a dataset.
        """
        ds = ds if isinstance(ds, xr.Dataset) else ds.to_dataset(name="ds")

        chunks = []
        for var in ds.data_vars:
            da = ds[var]
            chunk = da.chunks
            itemsize = da.data.itemsize
            # Use the largest chunk along each dimension for a conservative estimate
            size_of_chunk = itemsize * np.prod([np.max(x) for x in chunk]) / (1024**2)
            chunks.append(size_of_chunk)
        return max(chunks)

    # Use the builtin for the isinstance check; typing.List as a class-info
    # argument is deprecated
    if not isinstance(array_list, list):
        array_list = [array_list]
    # Choose iteration blocks to limit chunk size on dask arrays
    if array_list[0].chunks:  # Note: This is a way to check if the array is backed by a dask.array
        # without loading data into memory.
        # See https://docs.xarray.dev/en/stable/generated/xarray.DataArray.chunks.html
        ds_max_chunk_size_mb = max(_max_chunk_size_mb(obj) for obj in array_list)
        blocksize = int(MAX_BATCH_SIZE_MB / ds_max_chunk_size_mb)
        blocksize = min(blocksize, n_iteration)
        blocksize = max(blocksize, 1)
    else:
        blocksize = n_iteration

    # Generate full-size batches of iterations
    bootstraps = []
    for _ in range(blocksize, n_iteration + 1, blocksize):
        bootstraps.append(
            _block_bootstrap(
                array_list,
                blocks=blocks,
                n_iteration=blocksize,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )
    # Handle any remaining iterations that do not fill a whole batch
    leftover = n_iteration % blocksize

    if leftover:
        bootstraps.append(
            _block_bootstrap(
                array_list,
                blocks=blocks,
                n_iteration=leftover,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )

    # Concatenate the per-batch results for each input array along the
    # iteration dimension
    bootstraps_concat = tuple(
        xr.concat(
            bootstrap,
            dim="iteration",
            coords="minimal",
            compat="override",
        )
        for bootstrap in zip(*bootstraps)
    )

    if len(array_list) == 1:
        return bootstraps_concat[0]
    return bootstraps_concat
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc