• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

nci / scores / 13171996478

06 Feb 2025 04:44AM UTC coverage: 99.962% (-0.04%) from 100.0%
13171996478

Pull #418

github

nicholasloveday
improve speed of bootstrap tests
Pull Request #418: (under development) 114 add circular block bootstrapping

357 of 357 branches covered (100.0%)

Branch coverage included in aggregate %.

123 of 124 new or added lines in 3 files covered. (99.19%)

2245 of 2246 relevant lines covered (99.96%)

4.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.33
/src/scores/processing/block_bootstrap.py
1
"""
2
Functions for performing block bootstrapping of arrays. This is inspired from
3
https://github.com/dougiesquire/xbootstrap with modifications to make the functions more
4
testable and also consistent with the scores package.
5
"""
6

7
import math
4✔
8
import os
4✔
9
from collections import OrderedDict
4✔
10
from itertools import chain, cycle, islice
4✔
11
from typing import Dict, List, Tuple, Union
4✔
12

13
import numpy as np
4✔
14
import xarray as xr
4✔
15

16
from scores.typing import XarrayLike
4✔
17

18
MAX_CHUNK_SIZE_MB = 200
4✔
19

20

21
def _get_blocked_random_indices(
4✔
22
    shape: list[int], block_axis: int, block_size: int, prev_block_sizes: list[int], circular: bool = True
23
) -> np.ndarray:
24
    """
25
    Return indices to randomly sample an axis of an array in consecutive
26
    (cyclic) blocks.
27

28
    Args:
29
        shape: The shape of the array to sample
30
        block_axis: The axis along which to sample blocks
31
        block_size: The size of each block to sample
32
        prev_block_sizes: Sizes of previous blocks along other axes
33
        circular: whether to sample block circularly.
34

35
    Returns:
36
        An array of indices to use for block resampling.
37
    """
38

39
    def _random_blocks(length, block, circular):
4✔
40
        """
41
        Indices to randomly sample blocks in a along an axis of a specified
42
        length
43
        """
44
        if block == length:
4✔
45
            return list(range(length))
4✔
46
        repeats = math.ceil(length / block)
4✔
47
        if circular:
4✔
48
            indices = list(
4✔
49
                chain.from_iterable(
50
                    islice(cycle(range(length)), s, s + block) for s in np.random.randint(0, length, repeats)
51
                )
52
            )
53
        else:
54
            indices = list(
4✔
55
                chain.from_iterable(
56
                    islice(range(length), s, s + block) for s in np.random.randint(0, length - block + 1, repeats)
57
                )
58
            )
59
        return indices[:length]
4✔
60

61
    # Don't randomise within an outer block
62
    if len(prev_block_sizes) > 0:
4✔
63
        orig_shape = shape.copy()
4✔
64
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
65
            prev_ax = block_axis - (i + 1)
4✔
66
            shape[prev_ax] = math.ceil(shape[prev_ax] / b)
4✔
67

68
    if block_size == 1:
4✔
69
        indices = np.random.randint(
4✔
70
            0,
71
            shape[block_axis],
72
            shape,
73
        )
74
    else:
75
        non_block_shapes = [s for i, s in enumerate(shape) if i != block_axis]
4✔
76
        indices = np.moveaxis(
4✔
77
            np.stack(
78
                [_random_blocks(shape[block_axis], block_size, circular) for _ in range(np.prod(non_block_shapes))],
79
                axis=-1,
80
            ).reshape([shape[block_axis]] + non_block_shapes),
81
            0,
82
            block_axis,
83
        )
84

85
    if len(prev_block_sizes) > 0:
4✔
86
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
87
            prev_ax = block_axis - (i + 1)
4✔
88
            indices = np.repeat(indices, b, axis=prev_ax).take(range(orig_shape[prev_ax]), axis=prev_ax)
4✔
89
        return indices
4✔
90
    return indices
4✔
91

92

93
def _n_nested_blocked_random_indices(
    sizes: OrderedDict[str, Tuple[int, int]], n_iteration: int, circular: bool = True
) -> OrderedDict[str, np.ndarray]:
    """
    Return indices for repeated nested block resampling (with replacement).

    "Nested" resampling means the first dimension is randomly resampled, then
    for each randomly sampled element along that dimension the second
    dimension is resampled, then for each sampled element along that
    dimension the third dimension is resampled, and so on.

    Args:
        sizes: Dictionary with {name: (size, block)} of the dimensions to resample
        n_iteration: The number of times to repeat the random resampling
        circular: Whether or not to do circular resampling.

    Returns:
        A dictionary of arrays containing indices for nested block resampling.
    """

    dim_lengths = [size for size, _ in sizes.values()]
    result: OrderedDict[str, np.ndarray] = OrderedDict()
    blocks_so_far: List[int] = []
    for axis, (name, (_, block_len)) in enumerate(sizes.items()):
        # Each dimension's indices span all previously sampled dims plus the
        # iteration axis appended last
        sample_shape = dim_lengths[: axis + 1] + [n_iteration]
        result[name] = _get_blocked_random_indices(sample_shape, axis, block_len, blocks_so_far, circular)
        blocks_so_far.append(block_len)
    return result
121

122

123
def _expand_n_nested_random_indices(indices: list[np.ndarray]) -> Tuple[np.ndarray, ...]:
4✔
124
    """
125
    Expand the dimensions of the nested input arrays so that they can be
126
    broadcast and return a tuple that can be directly indexed
127

128
    Args:
129
    indices:List of numpy arrays of sequentially increasing dimension as output by
130
        the function ``_n_nested_blocked_random_indices``. The last axis on all
131
        inputs is assumed to correspond to the iteration axis
132

133
    Returns:
134
        Expanded indices suitable for broadcasting.
135
    """
136
    broadcast_ndim = indices[-1].ndim
4✔
137
    broadcast_indices = []
4✔
138
    for i, ind in enumerate(indices):
4✔
139
        expand_axes = list(range(i + 1, broadcast_ndim - 1))
4✔
140
        broadcast_indices.append(np.expand_dims(ind, axis=expand_axes))
4✔
141
    return (..., *tuple(broadcast_indices))
4✔
142

143

144
def _bootstrap(*arrays: np.ndarray, indices: List[np.ndarray]) -> Union[np.ndarray, Tuple[np.ndarray, ...]]:
4✔
145
    """
146
    Bootstrap the array(s) using the provided indices
147

148
    Args:
149
        arrays: list of arrays to bootstrap
150
        indices: list of arrays containing indices to use for bootstrapping each input array
151

152
    Returns:
153
        Bootstrapped arrays
154
    """
155
    bootstrapped = [array[ind] for array, ind in zip(arrays, indices)]
4✔
156
    if len(bootstrapped) == 1:
4✔
157
        return bootstrapped[0]
4✔
158
    return tuple(bootstrapped)
4✔
159

160

161
def _block_bootstrap(  # pylint: disable=too-many-locals
    array_list: List[XarrayLike],
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Tuple[xr.DataArray, ...]:
    """
    Repeatedly performs bootstrapping on provided arrays across specified dimensions, stacking
    the new arrays along a new "iteration" dimension. Bootstrapping is executed in a nested
    manner: the first provided dimension is bootstrapped, then for each bootstrapped sample
    along that dimension, the second provided dimension is bootstrapped, and so forth.

    Args:
        array_list: Data to bootstrap. Multiple arrays can be passed to be bootstrapped
            in the same way. All input arrays must have nested dimensions.
        blocks: Dictionary of dimension(s) to bootstrap and the block sizes to use
            along each dimension: ``{dim: blocksize}``. Nesting is based on the order of
            this dictionary.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        Tuple of bootstrapped xarray DataArrays or Datasets, based on the input.

    Raises:
        ValueError: if ``exclude_dims`` is not a list of lists, if its length does not match
            ``array_list``, if a block dimension has inconsistent sizes across arrays, or if
            no input array contains all dimensions in ``blocks.keys()``.

    Note:
        This function expands out the iteration dimension inside a universal function.
        However, this may generate very large chunks (multiplying chunk size by the number
        of iterations), causing issues for larger iterations. It's advisable to apply this
        function in blocks using 'block_bootstrap'.

    References:
    Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
      Academic press, 2011.
    """
    # Rename exclude_dims so they are not bootstrapped
    if exclude_dims is None:
        exclude_dims = [[] for _ in range(len(array_list))]
    if not isinstance(exclude_dims, list) or not all(isinstance(x, list) for x in exclude_dims):
        raise ValueError("exclude_dims should be a list of lists")
    if len(exclude_dims) != len(array_list):
        raise ValueError(
            "exclude_dims should be a list of the same length as the number of "
            "arrays containing lists of dimensions to exclude for each array"
        )

    # Work on a shallow copy so the caller's list is never modified in place.
    # This function may be called repeatedly with the same list (see
    # ``block_bootstrap``); renaming into the caller's list would break
    # subsequent calls when ``exclude_dims`` is used.
    array_list = list(array_list)

    renames = []
    for i, (obj, exclude) in enumerate(zip(array_list, exclude_dims)):
        array_list[i] = obj.rename(
            {d: f"dim{ii}" for ii, d in enumerate(exclude)},
        )
        renames.append({f"dim{ii}": d for ii, d in enumerate(exclude)})
    dim = list(blocks.keys())

    # Ensure bootstrapped dimensions have consistent sizes across array_list
    for d in blocks.keys():
        dim_sizes = [o.sizes[d] for o in array_list if d in o.dims]
        if not all(s == dim_sizes[0] for s in dim_sizes):
            raise ValueError(f"Block dimension {d} is not the same size on all input arrays")

    # Get the sizes of the bootstrap dimensions from the first array that has
    # all of them
    sizes = None
    for obj in array_list:
        try:
            sizes = OrderedDict({d: (obj.sizes[d], b) for d, b in blocks.items()})
            break
        except KeyError:
            pass
    if sizes is None:
        raise ValueError(
            "At least one input array must contain all dimensions in blocks.keys()",
        )

    # Generate random indices for bootstrapping all arrays in array_list
    nested_indices = _n_nested_blocked_random_indices(sizes, n_iteration, circular)

    # Expand indices for broadcasting for each array separately
    indices = []
    input_core_dims = []
    for obj in array_list:
        available_dims = [d for d in dim if d in obj.dims]
        indices_to_expand = [nested_indices[key] for key in available_dims]

        indices.append(_expand_n_nested_random_indices(indices_to_expand))
        input_core_dims.append(available_dims)

    # Process arrays separately to handle non-matching dimensions
    result = []
    for obj, ind, core_dims in zip(array_list, indices, input_core_dims):
        if isinstance(obj, xr.Dataset):
            # Assume all variables have the same dtype
            output_dtype = obj[list(obj.data_vars)[0]].dtype
        else:
            output_dtype = obj.dtype

        result.append(
            xr.apply_ufunc(
                _bootstrap,
                obj,
                kwargs={"indices": [ind]},
                input_core_dims=[core_dims],
                output_core_dims=[core_dims + ["iteration"]],
                dask="parallelized",
                dask_gufunc_kwargs={"output_sizes": {"iteration": n_iteration}},
                output_dtypes=[output_dtype],
            )
        )

    # Rename excluded dimensions back to their original names
    return tuple(res.rename(rename) for res, rename in zip(result, renames))
280

281

282
def block_bootstrap(
    array_list: List[XarrayLike] | XarrayLike,
    *,  # Enforce keyword-only arguments
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Union[XarrayLike, Tuple[XarrayLike, ...]]:
    """
    Perform block bootstrapping on provided arrays. The function creates new arrays by repeatedly
    bootstrapping along specified dimensions and stacking the new arrays along a new "iteration"
    dimension. Additionally, it includes internal functions for chunk size calculation and
    handling Dask arrays for chunk size limitation.

    Args:
        array_list: The data to bootstrap, which can be a single xarray object or
            a list of multiple xarray objects. In the case where
            multiple datasets are passed, each dataset can have its own set of dimensions. However,
            for successful bootstrapping, dimensions across all input arrays must be nested.
            For instance, for ``blocks.keys=['d1', 'd2', 'd3']``, an array with dimensions 'd1' and
            'd2' is valid, but an array with only dimension 'd2' is not valid. All datasets
            are bootstrapped according to the same random samples along available dimensions.
        blocks: A dictionary specifying the dimension(s) to bootstrap and the block sizes to
            use along each dimension: ``{dimension: block_size}``. The keys represent the dimensions
            to be bootstrapped, and the values indicate the block sizes along each dimension.
            The dimension provided here should exist in the data provided in ``array_list``.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        If a single Dataset/DataArray (XarrayLike) is provided, the function returns a
        bootstrapped XarrayLike object along the "iteration" dimension. If multiple XarrayLike
        objects are provided, it returns a tuple of bootstrapped XarrayLike objects, each stacked
        along the "iteration" dimension.

    References:
        - Gilleland, E. (2020). Bootstrap Methods for Statistical Inference. Part I:
            Comparative Forecast Verification for Continuous Variables. Journal of
            Atmospheric and Oceanic Technology, 37(11), 2117–2134. https://doi.org/10.1175/jtech-d-20-0069.1
        - Wilks, D. S. (2011). Statistical methods in the atmospheric sciences. Academic press.
            https://doi.org/10.1016/C2017-0-03921-6
    """

    # While the most efficient method involves expanding the iteration dimension within the
    # universal function, this approach might generate excessively large chunks (resulting
    # from multiplying chunk size by iterations) leading to issues with large numbers of
    # iterations. Hence, this function loops over blocks of iterations to generate the total
    # number of iterations.
    def _max_chunk_size_mb(ds):
        """
        Get the max chunk size (in MB) across all variables in a dataset
        """
        ds = ds if isinstance(ds, xr.Dataset) else ds.to_dataset(name="ds")

        chunks = []
        for var in ds.data_vars:
            da = ds[var]
            chunk = da.chunks
            itemsize = da.data.itemsize
            # The largest chunk along each dimension bounds the biggest single chunk
            size_of_chunk = itemsize * np.prod([np.max(x) for x in chunk]) / (1024**2)
            chunks.append(size_of_chunk)
        return max(chunks)

    # Use the builtin ``list`` for the runtime check (isinstance against
    # typing.List is deprecated)
    if not isinstance(array_list, list):
        array_list = [array_list]
    # Choose iteration blocks to limit chunk size on dask arrays
    if array_list[0].chunks:  # Note: This is a way to check if the array is backed by a dask.array
        # without loading data into memory.
        # See https://docs.xarray.dev/en/stable/generated/xarray.DataArray.chunks.html
        ds_max_chunk_size_mb = max(_max_chunk_size_mb(obj) for obj in array_list)
        blocksize = int(MAX_CHUNK_SIZE_MB / ds_max_chunk_size_mb)
        blocksize = min(blocksize, n_iteration)
        blocksize = max(blocksize, 1)
    else:
        blocksize = n_iteration

    bootstraps = []
    # Full-size iteration chunks
    for _ in range(blocksize, n_iteration + 1, blocksize):
        bootstraps.append(
            _block_bootstrap(
                # Pass a fresh shallow copy each call so any in-place
                # modification of the list by the callee cannot leak across
                # iteration chunks
                list(array_list),
                blocks=blocks,
                n_iteration=blocksize,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )
    # Remaining iterations that did not fill a whole chunk
    leftover = n_iteration % blocksize

    if leftover:
        bootstraps.append(
            _block_bootstrap(
                list(array_list),
                blocks=blocks,
                n_iteration=leftover,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )

    # Concatenate the per-chunk results along the iteration dimension
    bootstraps_concat = tuple(
        xr.concat(
            bootstrap,
            dim="iteration",
            coords="minimal",
            compat="override",
        )
        for bootstrap in zip(*bootstraps)
    )

    if len(array_list) == 1:
        return bootstraps_concat[0]
    return bootstraps_concat
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc