• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

nci / scores / 16336054633

17 Jul 2025 04:13AM UTC coverage: 99.449% (-0.6%) from 100.0%
16336054633

Pull #856

github

tennlee
Allow tests to run for both dask and non-dask environments
Some pylint issues are reported, but I think these are mostly unrelated, further code refactoring may be needed to bring the code up to full health according to pylint
Pull Request #856: [WIP] Branch to track progress removing unexpected reliance on dask

390 of 391 branches covered (99.74%)

Branch coverage included in aggregate %.

2496 of 2511 relevant lines covered (99.4%)

3.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.32
/src/scores/processing/block_bootstrap_impl.py
1
"""
2
Functions for performing block bootstrapping of arrays. This is inspired from
3
https://github.com/dougiesquire/xbootstrap with modifications to make the functions more
4
testable and also consistent with the scores package.
5
"""
6

7
import math
4✔
8
import os
4✔
9
from collections import OrderedDict
4✔
10
from itertools import chain, cycle, islice
4✔
11
from typing import Dict, List, Tuple, Union
4✔
12

13
import numpy as np
4✔
14
import xarray as xr
4✔
15

16
from scores.typing import XarrayLike
4✔
17
from scores.utils import tmp_coord_name
4✔
18

19
# When Dask is being used, this constant helps control the sizes of batches
20
# when bootstrapping
21
MAX_BATCH_SIZE_MB = 200
4✔
22

23

24
def _get_blocked_random_indices(
4✔
25
    shape: list[int], block_axis: int, block_size: int, prev_block_sizes: list[int], circular: bool = True
26
) -> np.ndarray:
27
    """
28
    Return indices to randomly sample an axis of an array in consecutive
29
    (cyclic) blocks.
30

31
    Args:
32
        shape: The shape of the array to sample
33
        block_axis: The axis along which to sample blocks
34
        block_size: The size of each block to sample
35
        prev_block_sizes: Sizes of previous blocks along other axes
36
        circular: whether to sample block circularly.
37

38
    Returns:
39
        An array of indices to use for block resampling.
40
    """
41

42
    def _random_blocks(length, block, circular):
4✔
43
        """
44
        Indices to randomly sample blocks in a along an axis of a specified
45
        length
46
        """
47
        if block == length:
4✔
48
            return list(range(length))
4✔
49
        repeats = math.ceil(length / block)
4✔
50
        if circular:
4✔
51
            indices = list(
4✔
52
                chain.from_iterable(
53
                    islice(cycle(range(length)), s, s + block) for s in np.random.randint(0, length, repeats)
54
                )
55
            )
56
        else:
57
            indices = list(
4✔
58
                chain.from_iterable(
59
                    islice(range(length), s, s + block) for s in np.random.randint(0, length - block + 1, repeats)
60
                )
61
            )
62
        return indices[:length]
4✔
63

64
    # Don't randomise within an outer block
65
    if len(prev_block_sizes) > 0:
4✔
66
        orig_shape = shape.copy()
4✔
67
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
68
            prev_ax = block_axis - (i + 1)
4✔
69
            shape[prev_ax] = math.ceil(shape[prev_ax] / b)
4✔
70

71
    if block_size == 1:
4✔
72
        indices = np.random.randint(
4✔
73
            0,
74
            shape[block_axis],
75
            shape,
76
        )
77
    else:
78
        non_block_shapes = [s for i, s in enumerate(shape) if i != block_axis]
4✔
79
        indices = np.moveaxis(
4✔
80
            np.stack(
81
                [_random_blocks(shape[block_axis], block_size, circular) for _ in range(np.prod(non_block_shapes))],
82
                axis=-1,
83
            ).reshape([shape[block_axis]] + non_block_shapes),
84
            0,
85
            block_axis,
86
        )
87

88
    if len(prev_block_sizes) > 0:
4✔
89
        for i, b in enumerate(prev_block_sizes[::-1]):
4✔
90
            prev_ax = block_axis - (i + 1)
4✔
91
            indices = np.repeat(indices, b, axis=prev_ax).take(range(orig_shape[prev_ax]), axis=prev_ax)
4✔
92
        return indices
4✔
93
    return indices
4✔
94

95

96
def _n_nested_blocked_random_indices(
    sizes: OrderedDict[str, Tuple[int, int]], n_iteration: int, circular: bool = True
) -> OrderedDict[str, np.ndarray]:
    """
    Return indices to randomly resample blocks of an array (with replacement)
    in a nested manner many times. "Nested" resampling means the first
    dimension is randomly resampled, then for each randomly sampled element
    along that dimension the second dimension is resampled, then for each of
    those the third dimension is resampled, and so on.

    Args:
        sizes: Dictionary with {name: (size, block)} of the dimensions to resample.
        n_iteration: The number of times to repeat the random resampling.
        circular: Whether or not to do circular resampling.

    Returns:
        A dictionary of arrays containing indices for nested block resampling.
    """
    axis_lengths = [size for size, _ in sizes.values()]
    resample_indices: OrderedDict[str, np.ndarray] = OrderedDict()
    earlier_blocks: List[int] = []
    for axis, (name, (_, block_len)) in enumerate(sizes.items()):
        # Each successive dimension gains one axis, plus the iteration axis
        sample_shape = axis_lengths[: axis + 1] + [n_iteration]
        resample_indices[name] = _get_blocked_random_indices(
            sample_shape, axis, block_len, earlier_blocks, circular
        )
        earlier_blocks.append(block_len)
    return resample_indices
124

125

126
def _expand_n_nested_random_indices(indices: list[np.ndarray]) -> Tuple[np.ndarray, ...]:
4✔
127
    """
128
    Expand the dimensions of the nested input arrays so that they can be
129
    broadcast and return a tuple that can be directly indexed
130

131
    Args:
132
    indices:List of numpy arrays of sequentially increasing dimension as output by
133
        the function ``_n_nested_blocked_random_indices``. The last axis on all
134
        inputs is assumed to correspond to the iteration axis
135

136
    Returns:
137
        Expanded indices suitable for broadcasting.
138
    """
139
    broadcast_ndim = indices[-1].ndim
4✔
140
    broadcast_indices = []
4✔
141
    for i, ind in enumerate(indices):
4✔
142
        expand_axes = list(range(i + 1, broadcast_ndim - 1))
4✔
143
        broadcast_indices.append(np.expand_dims(ind, axis=expand_axes))
4✔
144
    return (..., *tuple(broadcast_indices))
4✔
145

146

147
def _bootstrap(*arrays: np.ndarray, indices: List[np.ndarray]) -> Union[np.ndarray, Tuple[np.ndarray, ...]]:
4✔
148
    """
149
    Bootstrap the array(s) using the provided indices
150

151
    Args:
152
        arrays: list of arrays to bootstrap
153
        indices: list of arrays containing indices to use for bootstrapping each input array
154

155
    Returns:
156
        Bootstrapped arrays
157
    """
158
    bootstrapped = [array[ind] for array, ind in zip(arrays, indices)]
4✔
159
    if len(bootstrapped) == 1:
4✔
160
        return bootstrapped[0]
4✔
161
    return tuple(bootstrapped)
4✔
162

163

164
def _block_bootstrap(  # pylint: disable=too-many-locals
    array_list: List[XarrayLike],
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Tuple[xr.DataArray, ...]:
    """
    Repeatedly performs bootstrapping on provided arrays across specified dimensions, stacking
    the new arrays along a new "iteration" dimension. Bootstrapping is executed in a nested
    manner: the first provided dimension is bootstrapped, then for each bootstrapped sample
    along that dimension, the second provided dimension is bootstrapped, and so forth.

    Args:
        array_list: Data to bootstrap. Multiple arrays can be passed to be bootstrapped
            in the same way. All input arrays must have nested dimensions.
        blocks: Dictionary of dimension(s) to bootstrap and the block sizes to use
            along each dimension: ``{dim: blocksize}``. Nesting is based on the order of
            this dictionary.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        Tuple of bootstrapped xarray DataArrays or Datasets, based on the input.

    Raises:
        ValueError: If ``exclude_dims`` is not a list of lists, or not the same length as
            ``array_list``; if a block dimension has inconsistent sizes across arrays; or
            if no input array contains all dimensions in ``blocks.keys()``.

    Note:
        This function expands out the iteration dimension inside a universal function.
        However, this may generate very large chunks (multiplying chunk size by the number
        of iterations), causing issues for larger iterations. It's advisable to apply this
        function in blocks using 'block_bootstrap'.

    References:
        Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
        Academic press, 2011.
    """
    if exclude_dims is None:
        exclude_dims = [[] for _ in range(len(array_list))]
    if not isinstance(exclude_dims, list) or not all(isinstance(x, list) for x in exclude_dims):
        raise ValueError("exclude_dims should be a list of lists")
    if len(exclude_dims) != len(array_list):
        raise ValueError(
            "exclude_dims should be a list of the same length as the number of arrays in array_list",
        )

    # Work on a shallow copy so the in-place renaming below does not leak into
    # the caller's list. ``block_bootstrap`` calls this function repeatedly with
    # the same list; without the copy, the excluded dimensions would keep their
    # temporary names after the first call and later calls would fail.
    array_list = list(array_list)

    # Rename exclude_dims to temporary names so they are not bootstrapped
    renames = []
    for i, (obj, exclude) in enumerate(zip(array_list, exclude_dims)):
        new_dim_list = tmp_coord_name(obj, count=len(exclude))
        if isinstance(new_dim_list, str):
            new_dim_list = [new_dim_list]
        rename_dict = {d: f"{new_dim_list[ii]}" for ii, d in enumerate(exclude)}
        array_list[i] = obj.rename(rename_dict)
        # Remember the inverse mapping so names can be restored at the end
        renames.append({v: k for k, v in rename_dict.items()})

    dim = list(blocks.keys())

    # Ensure bootstrapped dimensions have consistent sizes across array_list
    for d in blocks.keys():
        dim_sizes = [o.sizes[d] for o in array_list if d in o.dims]
        if not all(s == dim_sizes[0] for s in dim_sizes):
            raise ValueError(f"Block dimension {d} is not the same size on all input arrays")

    # Get the sizes of the bootstrap dimensions from the first array that
    # contains all of them
    sizes = None
    for obj in array_list:
        try:
            sizes = OrderedDict({d: (obj.sizes[d], b) for d, b in blocks.items()})
            break
        except KeyError:
            pass
    if sizes is None:
        raise ValueError(
            "At least one input array must contain all dimensions in blocks.keys()",
        )

    # Generate random indices for bootstrapping all arrays the same way
    nested_indices = _n_nested_blocked_random_indices(sizes, n_iteration, circular)

    # Expand indices for broadcasting for each array separately, since each
    # array may contain only a leading subset of the block dimensions
    indices = []
    input_core_dims = []
    for obj in array_list:
        available_dims = [d for d in dim if d in obj.dims]
        indices_to_expand = [nested_indices[key] for key in available_dims]

        indices.append(_expand_n_nested_random_indices(indices_to_expand))
        input_core_dims.append(available_dims)

    # Process arrays separately to handle non-matching dimensions
    result = []
    for obj, ind, core_dims in zip(array_list, indices, input_core_dims):
        if isinstance(obj, xr.Dataset):
            # Assume all variables have the same dtype
            output_dtype = obj[list(obj.data_vars)[0]].dtype
        else:
            output_dtype = obj.dtype

        result.append(
            xr.apply_ufunc(
                _bootstrap,
                obj,
                kwargs={"indices": [ind]},
                input_core_dims=[core_dims],
                output_core_dims=[core_dims + ["iteration"]],
                dask="parallelized",
                dask_gufunc_kwargs={"output_sizes": {"iteration": n_iteration}},
                output_dtypes=[output_dtype],
            )
        )

    # Restore the original names of the excluded dimensions
    return tuple(res.rename(rename) for res, rename in zip(result, renames))
284

285

286
def block_bootstrap(
    array_list: List[XarrayLike] | XarrayLike,
    *,  # Enforce keyword-only arguments
    blocks: Dict[str, int],
    n_iteration: int,
    exclude_dims: Union[List[List[str]], None] = None,
    circular: bool = True,
) -> Union[XarrayLike, Tuple[XarrayLike, ...]]:
    """
    Perform block bootstrapping on provided arrays. The function creates new arrays by repeatedly
    bootstrapping along specified dimensions and stacking the new arrays along a new "iteration"
    dimension. Additionally, it includes internal functions for chunk size calculation and
    handling Dask arrays for chunk size limitation.

    Args:
        array_list: The data to bootstrap, which can be a single xarray object or
            a list of multiple xarray objects. In the case where
            multiple datasets are passed, each dataset can have its own set of dimension. However,
            for successful bootstrapping, dimensions across all input arrays must be nested.
            For instance, for ``block.keys=['d1', 'd2', 'd3']``, an array with dimension 'd1' and
            'd2' is valid, but an array with only dimension 'd2' is not valid. All datasets
            are bootstrapped according to the same random samples along available dimensions.
        blocks: A dictionary specifying the dimension(s) to bootstrap and the block sizes to
            use along each dimension: ``{dimension: block_size}``. The keys represent the dimensions
            to be bootstrapped, and the values indicate the block sizes along each dimension.
            The dimension provided here should exist in the data provided in ``array_list``.
        n_iteration: The number of iterations to repeat the bootstrapping process. Determines
            how many bootstrapped arrays will be generated and stacked along the iteration
            dimension.
        exclude_dims: An optional parameter indicating the dimensions to be excluded during
            bootstrapping for each array provided in ``array_list``. This parameter expects a list
            of lists, where each inner list corresponds to the dimensions to be excluded for
            the respective array. By default, the assumption is that no dimensions are
            excluded, and all arrays are bootstrapped across all specified dimensions in ``blocks``.
        circular: A boolean flag indicating whether circular block bootstrapping should be
            performed. Circular bootstrapping means that bootstrapping continues from the beginning
            when the end of the data is reached. By default, this parameter is set to True.

    Returns:
        If a single Dataset/DataArray (XarrayLike) is provided, the functions returns a
        bootstrapped XarrayLike object along the "iteration" dimension. If multiple XarrayLike
        objects are provided, it returns a tuple of bootstrapped XarrayLike objects, each stacked
        along the "iteration" dimension.

    Raises:
        ValueError: If bootstrapped dimensions don't have consistent sizes across ``array_list``.
        ValueError: If there is not at least one input array that contains all dimensions in blocks.keys().
        ValueError: If ``exclude_dims`` is not a list of lists.
        ValueError: If the list ``exclude_dims`` is not the same length as ``array_list``.

    References:
        - Gilleland, E. (2020). Bootstrap Methods for Statistical Inference. Part I:
          Comparative Forecast Verification for Continuous Variables. Journal of
          Atmospheric and Oceanic Technology, 37(11), 2117–2134. https://doi.org/10.1175/jtech-d-20-0069.1
        - Wilks, D. S. (2011). Statistical methods in the atmospheric sciences. Academic press.
          https://doi.org/10.1016/C2017-0-03921-6

    Examples:
        Bootstrap a fcst and obs dataset along the time and space dimensions with block sizes of 10
        for each dimension. The bootstrapping is repeated 1000 times.

        >>> import numpy as np
        >>> import xarray as xr
        >>> from scores.processing import block_bootstrap
        >>> obs = xr.DataArray(np.random.rand(100, 100), dims=["time", "space"])
        >>> fcst = xr.DataArray(np.random.rand(100, 100), dims=["time", "space"])
        >>> bootstrapped_obs, bootstrapped_fcst = block_bootstrap(
        ...     [obs, fcst],
        ...     blocks={"time": 10, "space": 10},
        ...     n_iteration=1000,
        ... )
    """

    # While the most efficient method involves expanding the iteration dimension within the
    # universal function, this approach might generate excessively large chunks (resulting
    # from multiplying chunk size by iterations) leading to issues with large numbers of
    # iterations. Hence, this function loops over blocks of iterations to generate the total
    # number of iterations.
    def _max_chunk_size_mb(ds):
        """
        Get the max chunk size (in MB) across the variables of a dataset.
        """
        ds = ds if isinstance(ds, xr.Dataset) else ds.to_dataset(name="ds")

        chunks = []
        for var in ds.data_vars:
            da = ds[var]
            chunk = da.chunks
            itemsize = da.data.itemsize
            # Use the largest chunk along each dimension for a conservative estimate
            size_of_chunk = itemsize * np.prod([np.max(x) for x in chunk]) / (1024**2)
            chunks.append(size_of_chunk)
        return max(chunks)

    # Use the builtin for the isinstance check; typing.List as a class-info
    # argument is deprecated
    if not isinstance(array_list, list):
        array_list = [array_list]
    # Choose iteration blocks to limit chunk size on dask arrays
    if array_list[0].chunks:  # Note: This is a way to check if the array is backed by a dask.array
        # without loading data into memory.
        # See https://docs.xarray.dev/en/stable/generated/xarray.DataArray.chunks.html
        ds_max_chunk_size_mb = max(_max_chunk_size_mb(obj) for obj in array_list)
        blocksize = int(MAX_BATCH_SIZE_MB / ds_max_chunk_size_mb)
        blocksize = min(blocksize, n_iteration)
        blocksize = max(blocksize, 1)
    else:
        blocksize = n_iteration

    # Generate full-size batches of iterations
    bootstraps = []
    for _ in range(blocksize, n_iteration + 1, blocksize):
        bootstraps.append(
            _block_bootstrap(
                array_list,
                blocks=blocks,
                n_iteration=blocksize,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )
    # Handle any remaining iterations that do not fill a whole batch
    leftover = n_iteration % blocksize

    if leftover:
        bootstraps.append(
            _block_bootstrap(
                array_list,
                blocks=blocks,
                n_iteration=leftover,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )

    # Concatenate the per-batch results for each input array along the
    # iteration dimension
    bootstraps_concat = tuple(
        xr.concat(
            bootstrap,
            dim="iteration",
            coords="minimal",
            compat="override",
        )
        for bootstrap in zip(*bootstraps)
    )

    if len(array_list) == 1:
        return bootstraps_concat[0]
    return bootstraps_concat
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc