
Ouranosinc / xclim, build 15023528003
13 May 2025 10:11PM UTC. Coverage: 92.362% (first build)

Pull Request #2136: Faster rle (github, web-flow)
Merge cd7bc3af5 into 4acb15afe

28 of 33 new or added lines in 1 file covered (84.85%)
7570 of 8196 relevant lines covered (92.36%)
8.09 hits per line

Source File: /src/xclim/indices/run_length.py (89.86% covered)
"""
Run-Length Algorithms Submodule
===============================

Computation of statistics on runs of True values in boolean arrays.
"""

from __future__ import annotations

from collections.abc import Callable, Sequence
from datetime import datetime
from typing import Literal
from warnings import warn

import numpy as np
import pandas as pd
import xarray as xr
from numba import njit

from xclim.core import DateStr, DayOfYearStr
from xclim.core.options import OPTIONS, RUN_LENGTH_UFUNC
from xclim.core.utils import get_temp_dimname, split_auxiliary_coordinates, uses_dask
from xclim.indices.helpers import resample_map

npts_opt = 9000
"""
Arrays with fewer than this number of data points per slice will trigger
the use of the ufunc version of the run length algorithms.
"""


def use_ufunc(
    ufunc_1dim: bool | str,
    da: xr.DataArray,
    dim: str = "time",
    freq: str | None = None,
    index: str = "first",
) -> bool:
    """
    Return whether the ufunc version of the run length algorithms should be used with this DataArray or not.

    If ufunc_1dim is 'from_context', the parameter is read from xclim's global (or context) options.
    If it is 'auto', this returns False for dask-backed arrays and for arrays with more than :py:const:`npts_opt`
    points per slice along `dim`.

    Parameters
    ----------
    ufunc_1dim : {'from_context', 'auto', True, False}
        The method for handling the ufunc parameters.
    da : xr.DataArray
        Input array.
    dim : str
        The dimension along which to find runs.
    freq : str
        Resampling frequency.
    index : {'first', 'last'}
        If 'first' (default), the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    bool
        If ufunc_1dim is "auto", returns False if the array is dask-backed or has too many points
        per slice, True otherwise. Otherwise, returns ufunc_1dim.
    """
    if ufunc_1dim is True and freq is not None:
        raise ValueError("Resampling after run length operations is not implemented for 1d method")

    if ufunc_1dim == "from_context":
        ufunc_1dim = OPTIONS[RUN_LENGTH_UFUNC]

    if ufunc_1dim == "auto":
        ufunc_1dim = not uses_dask(da) and (da.size // da[dim].size) < npts_opt
    # If resampling after the run length computation is requested, the 1d method is not implemented.
    # Unless ufunc_1dim was explicitly set to True (in which case an error is raised above),
    # we simply forbid this possibility here.
    return (index == "first") and ufunc_1dim and (freq is None)


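# Example (a minimal sketch; the array and sizes below are arbitrary assumptions):
# for a small, in-memory (non-dask) array, 'auto' resolves to the 1d ufunc path.
# >>> time = pd.date_range("2000-01-01", periods=100)
# >>> da = xr.DataArray(np.zeros((5, 100), dtype=bool), dims=("x", "time"), coords={"time": time})
# >>> use_ufunc("auto", da)  # 5 points per slice < npts_opt, not dask -> True
# True

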
def _is_chunked(da, dim):
    """Check whether `da` has non-trivial chunks along `dim`."""
    chunksize = (da.chunksizes.get(dim, (-1,)))[0]
    return chunksize not in (-1, da[dim].size)


def resample_and_rl(
    da: xr.DataArray,
    resample_before_rl: bool,
    compute: Callable,
    *args,
    freq: str,
    dim: str = "time",
    **kwargs,
) -> xr.DataArray:
    r"""
    Wrap run length algorithms to control whether resampling occurs before or after the algorithms.

    Parameters
    ----------
    da : xr.DataArray
        N-dimensional array (boolean).
    resample_before_rl : bool
        Determines whether the input array `da` should be split into periods before
        or after the run length algorithms are applied.
    compute : Callable
        Run length function to apply.
    *args : Any
        Positional arguments needed in `compute`.
    freq : str
        Resampling frequency.
    dim : str
        The dimension along which to find runs.
    **kwargs : Any
        Keyword arguments needed in `compute`.

    Returns
    -------
    xr.DataArray
        Output of `compute` resampled according to frequency `freq`.
    """
    if resample_before_rl:
        out = resample_map(
            da,
            dim,
            freq,
            compute,
            map_kwargs={"args": args, "freq": None, "dim": dim, **kwargs},
        )
    else:
        out = compute(da, *args, dim=dim, freq=freq, **kwargs)
    return out


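# Example (a minimal sketch; `cond` and the frequency are assumptions):
# compute the longest annual run, resampling into years *before* the run-length step.
# >>> time = pd.date_range("2000-01-01", periods=730)
# >>> cond = xr.DataArray(np.random.rand(730) > 0.5, dims="time", coords={"time": time})
# >>> out = resample_and_rl(cond, True, longest_run, freq="YS")  # one value per year

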
def _smallest_uint(da, dim):
    for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
        if np.iinfo(dtype).max > da[dim].size:
            return dtype
    return np.uint64


# Specifying `one` allows in-place multiplication with *=
@njit
def _cumsum_reset_np(arr, index, one):
    """100110111 -> 100120123"""
    # Run the cumulative sum (via in-place products) forward or backward, depending on `index`.
    it = range(1, arr.shape[-1]) if index == "last" else range(arr.shape[-1] - 2, -1, -1)
    for i in it:
        # This works because we assume the array contains only 1's and 0's.
        arr[..., i] *= arr[..., i - it.step] + one
    return arr


def _cumsum_reset_xr(da, dim, index, reset_on_zero):
    """100110111 -> 100120123"""
    # `index="first"` case: the algorithm is applied on the inverted array and the output is inverted back.
    if index == "first":
        da = da[{dim: slice(None, None, -1)}]

    # Example: da == 100110111 -> cs_s == 100120123
    cs = da.cumsum(dim=dim)  # cumulative sum  e.g. 111233456
    cond = da == 0 if reset_on_zero else da.isnull()  # reset condition
    cs2 = cs.where(cond)  # keep only numbers at positions of zeroes e.g. N11NN3NNN (default)
    cs2[{dim: 0}] = 0  # put a zero in front e.g. 011NN3NNN
    cs2 = cs2.ffill(dim=dim)  # e.g. 011113333
    out = cs - cs2
    if index == "first":
        out = out[{dim: slice(None, None, -1)}]
    return out


def _cumsum_reset(
    da: xr.DataArray,
    dim: str = "time",
    index: str = "last",
    reset_on_zero: bool = True,
) -> xr.DataArray:
    """
    Compute the cumulative sum for each series of numbers separated by zero.

    Parameters
    ----------
    da : xr.DataArray
        Input array.
    dim : str
        Dimension name along which the cumulative sum is taken.
    index : {'first', 'last'}
        If 'first', the largest value of the cumulative sum is indexed with the first element in the run.
        If 'last' (default), with the last element in the run.
    reset_on_zero : bool
        If True, the cumulative sum is reset on each zero value of `da`. Otherwise, the cumulative sum resets
        on NaNs. Default is True.

    Returns
    -------
    xr.DataArray
        An array with cumulative sums.
    """
    if not _is_chunked(da, dim) and reset_on_zero:
        # only ints can be used in this case
        typ = _smallest_uint(da, dim)
        out = xr.apply_ufunc(
            _cumsum_reset_np,
            da.astype(typ),
            input_core_dims=[[dim]],
            output_core_dims=[[dim]],
            dask="parallelized",
            kwargs={"index": index, "one": typ(1)},
        ).astype(float)
    else:
        out = _cumsum_reset_xr(da, dim, index, reset_on_zero)
    return out


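# Example (a minimal sketch; input values are assumptions):
# the cumulative sum restarts after each zero, mirroring the "100110111 -> 100120123" docstrings.
# >>> time = pd.date_range("2000-01-01", periods=9)
# >>> x = xr.DataArray([1, 0, 0, 1, 1, 0, 1, 1, 1], dims="time", coords={"time": time})
# >>> _cumsum_reset(x).values  # index="last"
# array([1., 0., 0., 1., 2., 0., 1., 2., 3.])

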
# TODO: Check if rle would be more performant with ffill/bfill instead of two times [{dim: slice(None, None, -1)}]
def rle(
    da: xr.DataArray,
    dim: str = "time",
    index: str = "first",
) -> xr.DataArray:
    """
    Run length.

    Despite its name, this is not an actual run-length encoder: it returns an array of the same shape
    as the input with 0 where the input was <= 0, nan where the input was > 0, except on the first (or last) element
    of each run of consecutive > 0 values, where it is set to the sum of the elements within the run.
    For an actual run-length encoder, see :py:func:`rle_1d`.

    Usually, the input would be a boolean mask and the first element of each run would then be set to
    the run's length (thus the name), but the function also accepts int and float inputs.

    Parameters
    ----------
    da : xr.DataArray
        Input array.
    dim : str
        Dimension name.
    index : {'first', 'last'}
        If 'first' (default), the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    xr.DataArray
        The run length array.
    """
    if da.dtype == bool:
        da = da.astype(int)
    # "first" case: the algorithm is applied on the inverted array and the output is inverted back.
    if index == "first":
        da = da[{dim: slice(None, None, -1)}]

    # Get the cumulative sum for each series of 1's, e.g. da == 100110111 -> cs_s == 100120123
    cs_s = _cumsum_reset(da, dim)

    # Keep the total length of each series (and also keep 0's), e.g. 100120123 -> 100N20NN3
    # Keep numbers with a 0 to the right, and also the last number.
    # We could keep a -1 instead of nan to save space? That way we could keep int types.
    cs_s = cs_s.where(da.shift({dim: -1}, fill_value=0) == 0)
    out = cs_s.where(da > 0, 0)  # Reinsert 0's at their original place

    # Inverting back if needed, e.g. 100N20NN3 -> 3NN02N001. This is the output of
    # `rle` for 111011001 with index == "first".
    if index == "first":
        out = out[{dim: slice(None, None, -1)}]

    return out


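# Example (a minimal sketch; the mask below is an assumption):
# for 111011001 with index="first", each run's first element carries the run length.
# >>> time = pd.date_range("2000-01-01", periods=9)
# >>> x = xr.DataArray([1, 1, 1, 0, 1, 1, 0, 0, 1], dims="time", coords={"time": time}).astype(bool)
# >>> rle(x).values
# array([ 3., nan, nan,  0.,  2., nan,  0.,  0.,  1.])

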
def rle_statistics(
    da: xr.DataArray,
    reducer: str,
    window: int,
    dim: str = "time",
    freq: str | None = None,
    ufunc_1dim: str | bool = "from_context",
    index: str = "first",
) -> xr.DataArray:
    """
    Return the length of consecutive runs of True values, according to a reducing operator.

    Parameters
    ----------
    da : xr.DataArray
        N-dimensional array (boolean).
    reducer : str
        Name of the reducing function.
    window : int
        Minimal length of consecutive runs to be included in the statistics.
    dim : str
        Dimension along which to calculate consecutive runs (default: 'time').
    freq : str
        Resampling frequency.
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        It can be modified globally through the "run_length_ufunc" global option.
    index : {'first', 'last'}
        If 'first' (default), the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    xr.DataArray, [int]
        Length of runs of True values along dimension, according to the reducing function (float).
        If there are no runs (but the data is valid), returns 0.
    """
    ufunc_1dim = use_ufunc(ufunc_1dim, da, dim=dim, index=index, freq=freq)
    if ufunc_1dim:
        rl_stat = statistics_run_ufunc(da, reducer, window, dim)
    else:
        d = rle(da, dim=dim, index=index)

        def get_rl_stat(d):
            rl_stat = getattr(d.where(d >= window), reducer)(dim=dim)
            rl_stat = xr.where((d.isnull() | (d < window)).all(dim=dim), 0, rl_stat)
            return rl_stat

        if freq is None:
            rl_stat = get_rl_stat(d)
        else:
            rl_stat = resample_map(d, dim, freq, get_rl_stat)
    return rl_stat


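# Example (a minimal sketch; values are assumptions):
# with runs of lengths 3, 2 and 1, only runs >= window count toward the statistic.
# >>> time = pd.date_range("2000-01-01", periods=9)
# >>> x = xr.DataArray([1, 1, 1, 0, 1, 1, 0, 0, 1], dims="time", coords={"time": time}).astype(bool)
# >>> float(rle_statistics(x, reducer="max", window=2))
# 3.0

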
def longest_run(
    da: xr.DataArray,
    dim: str = "time",
    freq: str | None = None,
    ufunc_1dim: str | bool = "from_context",
    index: str = "first",
) -> xr.DataArray:
    """
    Return the length of the longest consecutive run of True values.

    Parameters
    ----------
    da : xr.DataArray
        N-dimensional array (boolean).
    dim : str
        Dimension along which to calculate consecutive runs (default: 'time').
    freq : str
        Resampling frequency.
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        It can be modified globally through the "run_length_ufunc" global option.
    index : {'first', 'last'}
        If 'first', the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    xr.DataArray, [int]
        Length of the longest run of True values along dimension (int).
    """
    return rle_statistics(
        da,
        reducer="max",
        window=1,
        dim=dim,
        freq=freq,
        ufunc_1dim=ufunc_1dim,
        index=index,
    )


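# Example (a minimal sketch, reusing the series from the `rle_statistics` sketch above):
# >>> float(longest_run(x))  # runs of lengths 3, 2 and 1 -> 3
# 3.0

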
def windowed_run_events(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    freq: str | None = None,
    ufunc_1dim: str | bool = "from_context",
    index: str = "first",
) -> xr.DataArray:
    """
    Return the number of runs of a minimum length.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum run length.
        When equal to 1, an optimized version of the algorithm is used.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    freq : str
        Resampling frequency.
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        Ignored when `window=1`. It can be modified globally through the "run_length_ufunc" global option.
    index : {'first', 'last'}
        If 'first', the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    xr.DataArray, [int]
        Number of distinct runs of a minimum length (int).
    """
    ufunc_1dim = use_ufunc(ufunc_1dim, da, dim=dim, index=index, freq=freq)

    if ufunc_1dim:
        out = windowed_run_events_ufunc(da, window, dim)

    else:
        if window == 1:
            shift = 1 * (index == "first") + -1 * (index == "last")
            d = xr.where(da.shift({dim: shift}, fill_value=0) == 0, 1, 0)
            d = d.where(da == 1, 0)
        else:
            d = rle(da, dim=dim, index=index)
            d = xr.where(d >= window, 1, 0)
        if freq is not None:
            d = d.resample({dim: freq})
        out = d.sum(dim=dim)

    return out


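# Example (a minimal sketch; values are assumptions):
# >>> time = pd.date_range("2000-01-01", periods=9)
# >>> x = xr.DataArray([1, 1, 1, 0, 1, 1, 0, 0, 1], dims="time", coords={"time": time}).astype(bool)
# >>> int(windowed_run_events(x, window=2))  # two runs (lengths 3 and 2) are >= 2
# 2

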
def windowed_run_count(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    freq: str | None = None,
    ufunc_1dim: str | bool = "from_context",
    index: str = "first",
) -> xr.DataArray:
    """
    Return the number of consecutive true values in array for runs at least as long as given duration.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum run length.
        When equal to 1, an optimized version of the algorithm is used.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    freq : str
        Resampling frequency.
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        Ignored when `window=1`. It can be modified globally through the "run_length_ufunc" global option.
    index : {'first', 'last'}
        If 'first', the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    xr.DataArray, [int]
        Total number of `True` values that are part of consecutive runs at least `window` long.
    """
    ufunc_1dim = use_ufunc(ufunc_1dim, da, dim=dim, index=index, freq=freq)

    if ufunc_1dim:
        out = windowed_run_count_ufunc(da, window, dim)

    elif window == 1 and freq is None:
        out = da.sum(dim=dim)

    else:
        d = rle(da, dim=dim, index=index)
        d = d.where(d >= window, 0)
        if freq is not None:
            d = d.resample({dim: freq})
        out = d.sum(dim=dim)

    return out


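# Example (a minimal sketch, reusing `x` from the sketch above):
# >>> int(windowed_run_count(x, window=2))  # 3 + 2 True values in qualifying runs
# 5

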
def windowed_max_run_sum(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    freq: str | None = None,
    index: str = "first",
) -> xr.DataArray:
    """
    Return the maximum sum of consecutive float values for runs at least as long as the given window length.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (float).
    window : int
        Minimum run length.
        When equal to 1, an optimized version of the algorithm is used.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    freq : str
        Resampling frequency.
    index : {'first', 'last'}
        If 'first', the run length is indexed with the first element in the run.
        If 'last', with the last element in the run.

    Returns
    -------
    xr.DataArray, [float]
        Maximum sum of values over the consecutive runs of at least `window` long.
    """
    if window == 1 and freq is None:
        # using _cumsum_reset_xr instead of rle to be able to handle floats
        out = _cumsum_reset_xr(da, dim=dim, index=index, reset_on_zero=True).max(dim=dim)

    else:
        # using _cumsum_reset_xr instead of rle to be able to handle floats
        d_rse = _cumsum_reset_xr(da, dim=dim, index=index, reset_on_zero=True)
        d_rle = rle(da > 0, dim=dim, index=index)

        d = d_rse.where(d_rle >= window, 0)
        if freq is not None:
            d = d.resample({dim: freq})
        out = d.max(dim=dim)

    return out


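# Example (a minimal sketch; values are assumptions):
# >>> time = pd.date_range("2000-01-01", periods=7)
# >>> p = xr.DataArray([0.5, 1.0, 0.0, 2.0, 3.0, 1.0, 0.0], dims="time", coords={"time": time})
# >>> float(windowed_max_run_sum(p, window=2))  # run sums are 1.5 and 6.0
# 6.0

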
def _boundary_run(
    da: xr.DataArray,
    window: int,
    dim: str,
    freq: str | None,
    coord: str | bool | None,
    ufunc_1dim: str | bool,
    position: str,
) -> xr.DataArray:
    """
    Return the index of the first item of the first or last run of at least a given length.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
        When equal to 1, an optimized version of the algorithm is used.
    dim : str
        Dimension along which to calculate consecutive run.
    freq : str
        Resampling frequency.
    coord : Optional[str]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        Ignored when `window=1`. It can be modified globally through the "run_length_ufunc" global option.
    position : {"first", "last"}
        Determines if the algorithm finds the "first" or "last" run.

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of first item in first (last) valid run.
        Returns np.nan if there are no valid runs.
    """

    # FIXME: The logic here should not use outside scope variables, but rather pass them as arguments.
    def coord_transform(out, da):
        """Transform indexes to coordinates if needed, and drop the obsolete dim."""
        if coord:
            crd = da[dim]
            if isinstance(coord, str):
                crd = getattr(crd.dt, coord)
            out = lazy_indexing(crd, out)

        if dim in out.coords:
            out = out.drop_vars(dim)
        return out

    # FIXME: The logic here should not use outside scope variables, but rather pass them as arguments.
    # general method to get indices (or coords) of the first run
    def find_boundary_run(runs, position):
        if position == "last":
            runs = runs[{dim: slice(None, None, -1)}]
        dmax_ind = runs.argmax(dim=dim)
        # If there are no runs, dmax_ind will be 0: we must replace this with NaN
        out = dmax_ind.where(dmax_ind != runs.argmin(dim=dim))
        if position == "last":
            out = runs[dim].size - out - 1
            runs = runs[{dim: slice(None, None, -1)}]
        out = coord_transform(out, runs)
        return out

    ufunc_1dim = use_ufunc(ufunc_1dim, da, dim=dim, freq=freq)

    da = da.fillna(0)  # We expect a boolean array, but there could be NaNs nonetheless
    if window == 1:
        if freq is not None:
            out = resample_map(da, dim, freq, find_boundary_run, map_kwargs={"position": position})
        else:
            out = find_boundary_run(da, position)

    elif ufunc_1dim:
        if position == "last":
            da = da[{dim: slice(None, None, -1)}]
        out = first_run_ufunc(x=da, window=window, dim=dim)
        if position == "last" and not coord:
            out = da[dim].size - out - 1
            da = da[{dim: slice(None, None, -1)}]
        out = coord_transform(out, da)

    else:
        # _cumsum_reset is an intermediate step in rle, which is sufficient here
        d = _cumsum_reset(da, dim=dim, index=position)
        d = xr.where(d >= window, 1, 0)
        # for the "first" run, return the "first" element in the run (and conversely for the "last" run)
        if freq is not None:
            out = resample_map(d, dim, freq, find_boundary_run, map_kwargs={"position": position})
        else:
            out = find_boundary_run(d, position)

    return out


def first_run(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    freq: str | None = None,
    coord: str | bool | None = False,
    ufunc_1dim: str | bool = "from_context",
) -> xr.DataArray:
    """
    Return the index of the first item of the first run of at least a given length.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
        When equal to 1, an optimized version of the algorithm is used.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    freq : str
        Resampling frequency.
    coord : Optional[str]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        Ignored when `window=1`. It can be modified globally through the "run_length_ufunc" global option.

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of first item in first valid run.
        Returns np.nan if there are no valid runs.
    """
    out = _boundary_run(
        da,
        window=window,
        dim=dim,
        freq=freq,
        coord=coord,
        ufunc_1dim=ufunc_1dim,
        position="first",
    )
    return out


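# Example (a minimal sketch; values are assumptions):
# >>> time = pd.date_range("2000-01-01", periods=6)
# >>> y = xr.DataArray([0, 0, 1, 1, 1, 0], dims="time", coords={"time": time}).astype(bool)
# >>> float(first_run(y, window=2))  # the first run of >= 2 True values starts at index 2
# 2.0

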
def last_run(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    freq: str | None = None,
    coord: str | bool | None = False,
    ufunc_1dim: str | bool = "from_context",
) -> xr.DataArray:
    """
    Return the index of the last item of the last run of at least a given length.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
        When equal to 1, an optimized version of the algorithm is used.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    freq : str
        Resampling frequency.
    coord : Optional[str]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').
    ufunc_1dim : Union[str, bool]
        Use the 1d 'ufunc' version of this function: the default ('auto') will attempt to select the optimal
        usage based on the number of data points. Using `ufunc_1dim=True` is typically more efficient
        for a DataArray with a small number of grid points.
        Ignored when `window=1`. It can be modified globally through the "run_length_ufunc" global option.

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of last item in last valid run.
        Returns np.nan if there are no valid runs.
    """
    out = _boundary_run(
        da,
        window=window,
        dim=dim,
        freq=freq,
        coord=coord,
        ufunc_1dim=ufunc_1dim,
        position="last",
    )
    return out


# TODO: Add window arg
# TODO: Inverse window arg to tolerate holes?
def run_bounds(mask: xr.DataArray, dim: str = "time", coord: bool | str = True):
    """
    Return the start and end dates of boolean runs along a dimension.

    Parameters
    ----------
    mask : xr.DataArray
        Boolean array.
    dim : str
        Dimension along which to look for runs.
    coord : bool or str
        If `True`, return values of the coordinate, if a string, returns values from `dim.dt.<coord>`.
        If `False`, return indexes.

    Returns
    -------
    xr.DataArray
        With ``dim`` reduced to "events" and "bounds". The events dim is as long as needed, padded with NaN or NaT.
    """
    if uses_dask(mask):
        raise NotImplementedError("Dask arrays not supported as we can't know the final event number before computing.")

    diff = xr.concat((mask.isel({dim: [0]}).astype(int), mask.astype(int).diff(dim)), dim)

    nstarts = (diff == 1).sum(dim).max().item()

    def _get_indices(arr, *, N):
        out = np.full((N,), np.nan, dtype=float)
        inds = np.where(arr)[0]
        out[: len(inds)] = inds
        return out

    starts = xr.apply_ufunc(
        _get_indices,
        diff == 1,
        input_core_dims=[[dim]],
        output_core_dims=[["events"]],
        kwargs={"N": nstarts},
        vectorize=True,
    )

    ends = xr.apply_ufunc(
        _get_indices,
        diff == -1,
        input_core_dims=[[dim]],
        output_core_dims=[["events"]],
        kwargs={"N": nstarts},
        vectorize=True,
    )

    if coord:
        crd = mask[dim]
        if isinstance(coord, str):
            crd = getattr(crd.dt, coord)

        starts = lazy_indexing(crd, starts)
        ends = lazy_indexing(crd, ends)
    return xr.concat((starts, ends), "bounds")


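# Example (a minimal sketch; values are assumptions):
# two runs (indices 1-3 and 6-7); ends are the first index *after* each run.
# >>> time = pd.date_range("2000-01-01", periods=9)
# >>> m = xr.DataArray([0, 1, 1, 1, 0, 0, 1, 1, 0], dims="time", coords={"time": time}).astype(bool)
# >>> run_bounds(m, coord=False).values
# array([[1., 6.],
#        [4., 8.]])

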
def keep_longest_run(da: xr.DataArray, dim: str = "time", freq: str | None = None) -> xr.DataArray:
    """
    Keep the longest run along a dimension.

    Parameters
    ----------
    da : xr.DataArray
        Boolean array.
    dim : str
        Dimension along which to check for the longest run.
    freq : str
        Resampling frequency.

    Returns
    -------
    xr.DataArray, [bool]
        Boolean array similar to da but with only one run, the (first) longest.
    """
    # Get run lengths
    rls = rle(da, dim)

    def _get_out(_rls):  # numpydoc ignore=GL08
        _out = xr.where(
            # Construct an integer array and find the max
            _rls[dim].copy(data=np.arange(_rls[dim].size)) == _rls.argmax(dim),
            _rls + 1,  # Add one to the first longest run
            _rls,
        )
        _out = _out.ffill(dim) == _out.max(dim)
        return _out

    if freq is not None:
        out = resample_map(rls, dim, freq, _get_out)
    else:
        out = _get_out(rls)

    return da.copy(data=out.transpose(*da.dims).data)


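# Example (a minimal sketch; values are assumptions):
# only the longest run (indices 3-5) is kept True.
# >>> time = pd.date_range("2000-01-01", periods=8)
# >>> z = xr.DataArray([1, 1, 0, 1, 1, 1, 0, 1], dims="time", coords={"time": time}).astype(bool)
# >>> keep_longest_run(z).values
# array([False, False, False,  True,  True,  True, False, False])

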
def runs_with_holes(
    da_start: xr.DataArray,
    window_start: int,
    da_stop: xr.DataArray,
    window_stop: int,
    dim: str = "time",
) -> xr.DataArray:
    """
    Extract events, i.e. runs whose starting and stopping points are defined through run length conditions.

    Parameters
    ----------
    da_start : xr.DataArray
        Input array where run sequences are searched to define the start points in the main runs.
    window_start : int
        Number of True (1) values needed to start a run in `da_start`.
    da_stop : xr.DataArray
        Input array where run sequences are searched to define the stop points in the main runs.
    window_stop : int
        Number of True (1) values in `da_stop` needed to stop a run.
    dim : str
        Dimension name.

    Returns
    -------
    xr.DataArray
        Output array with 1's when in a run sequence and with 0's elsewhere.

    Notes
    -----
    A season (as defined in ``season``) could be considered as an event with ``window_stop == window_start``
    and ``da_stop == 1 - da_start``, although it has more constraints on when to start and stop a run through
    the `date` argument and only one season can be found.
    """
    da_start = da_start.astype(int).fillna(0)
    da_stop = da_stop.astype(int).fillna(0)

    start_runs = _cumsum_reset(da_start, dim=dim, index="first")
    stop_runs = _cumsum_reset(da_stop, dim=dim, index="first")
    start_positions = xr.where(start_runs >= window_start, 1, np.nan)
    stop_positions = xr.where(stop_runs >= window_stop, 0, np.nan)

    # start positions (1) are forward-filled until a stop position (0) is met
    runs = stop_positions.combine_first(start_positions).ffill(dim=dim).fillna(0)
    return runs


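# Example (a minimal sketch; values are assumptions):
# the event starts where `on` begins a run of at least 3, and stops where `off` begins a run of at least 2.
# >>> time = pd.date_range("2000-01-01", periods=8)
# >>> on = xr.DataArray([1, 1, 1, 0, 0, 1, 0, 0], dims="time", coords={"time": time})
# >>> off = 1 - on
# >>> runs_with_holes(on, 3, off, 2).values
# array([1., 1., 1., 0., 0., 0., 0., 0.])

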
def season_start(
    da: xr.DataArray,
    window: int,
    mid_date: DayOfYearStr | None = None,
    dim: str = "time",
    coord: str | bool | None = False,
) -> xr.DataArray:
    """
    Start of a season.

    See :py:func:`season`.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive values to start and end the season.
    mid_date : DayOfYearStr, optional
        The date (in MM-DD format) that a season must include to be considered valid.
    dim : str
        Dimension along which to calculate the season (default: 'time').
    coord : Optional[str]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    xr.DataArray
        Start of the season, units depend on `coord`.

    See Also
    --------
    season : Calculate the bounds of a season along a dimension.
    season_end : End of a season.
    season_length : Length of a season.
    """
    return first_run_before_date(da, window=window, date=mid_date, dim=dim, coord=coord)


def season_end(
    da: xr.DataArray,
    window: int,
    mid_date: DayOfYearStr | None = None,
    dim: str = "time",
    coord: str | bool | None = False,
    _beg: xr.DataArray | None = None,
) -> xr.DataArray:
    """
    End of a season.

    See :py:func:`season`. Similar to :py:func:`first_run_after_date`, but here a season
    must have a start for an end to be valid. Also, if no end is found but a start was found,
    the end is set to the last element of the series.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive values to start and end the season.
    mid_date : DayOfYearStr, optional
        The date (in MM-DD format) that a run must include to be considered valid.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    coord : str, optional
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').
    _beg : xr.DataArray, optional
        If given, the start of the season. This is used to avoid recomputing the start.

    Returns
    -------
    xr.DataArray
        End of the season, units depend on `coord`.
        If a start is found but no end, the end is set to the last element.

    See Also
    --------
    season : Calculate the bounds of a season along a dimension.
    season_start : Start of a season.
    season_length : Length of a season.
    """
    # Fast path for `season` and `season_length`
    if _beg is not None:
        beg = _beg
    else:
        beg = season_start(da, window=window, dim=dim, mid_date=mid_date, coord=False)
    # Invert the condition and mask all values before the beginning;
    # we fillna(0) so as to differentiate series with no runs from all-nan series.
    not_da = (~da).where(da[dim].copy(data=np.arange(da[dim].size)) >= beg.fillna(0))
    end = first_run_after_date(not_da, window=window, dim=dim, date=mid_date, coord=False)
    if _beg is None:
        # Where end is null and beg is not null (valid data, no end detected), put the last index.
        # Don't do this in the fast path, so that the length computation can use this information.
        end = xr.where(end.isnull() & beg.notnull(), da[dim].size - 1, end)
        end = end.where(beg.notnull())
    if coord:
        crd = da[dim]
        if isinstance(coord, str):
            crd = getattr(crd.dt, coord)
        end = lazy_indexing(crd, end)
    return end


def season(
    da: xr.DataArray,
    window: int,
    mid_date: DayOfYearStr | None = None,
    dim: str = "time",
    stat: str | None = None,
    coord: str | bool | None = False,
) -> xr.Dataset | xr.DataArray:
    """
    Calculate the bounds of a season along a dimension.

    A "season" is a run of True values that may include breaks under a given length (`window`).
    The start is computed as the first run of `window` True values, and the end as the first subsequent run
    of `window` False values. The end element is the first element after the season.
    If a date is given, it must be included in the season
    (i.e. the start cannot occur later and the end cannot occur earlier).

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive values to start and end the season.
    mid_date : DayOfYearStr, optional
        The date (in MM-DD format) that a run must include to be considered valid.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    stat : str, optional
        Not currently implemented.
        If not None, return a statistic of the season. The statistic is calculated on the season's values.
    coord : Optional[str]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    xr.Dataset
        The Dataset variables:
            start : start of the season (index or units depending on ``coord``)
            end : end of the season (index or units depending on ``coord``)
            length : length of the season (in number of elements along ``dim``)

    See Also
    --------
    season_start : Start of a season.
    season_end : End of a season.
    season_length : Length of a season.

    Notes
    -----
    The run can include holes of False or NaN values, so long as they do not exceed the window size.

    If a date is given, the season start and end are forced to be on each side of this date. This means that
    even if the "real" season has been over for a long time, this is the date used in the length calculation.
    E.g. for the length of the "warm season", where T > 25°C, with date = 1st August: say the temperature is
    above 25°C for all of June, but July and August have very cold temperatures. Instead of returning
    30 days (June), the function will return 61 days (June + July).

    The season's length is always the difference between the end and the start. Except if no season end was
    found before the end of the data. In that case the end is set to the last element and the length is set to
    the data size minus the start index. Thus, for that specific case, :math:`length = end - start + 1`,
    because the end falls on the last element of the season instead of the subsequent one.
    """
    beg = season_start(da, window=window, dim=dim, mid_date=mid_date, coord=False)
    # Use fast path in season_end: no recomputing of start,
    # no masking of end where beg.isnull(), and don't set end if none found
    end = season_end(da, window=window, dim=dim, mid_date=mid_date, _beg=beg, coord=False)
    # Three cases:
    #           start     no start
    # end       e - s        0
    # no end   size - s      0
    # da is boolean, so we have no way of knowing if the absence of a season
    # is due to missing values or to an actual absence.
    length = xr.where(
        beg.isnull(),
        0,
        # Where there is no end, from the start to the boundary
        xr.where(end.isnull(), da[dim].size - beg, end - beg),
    )
    # Now mask the ends.
    # Still give an end if we didn't find any: put the last element.
    # This breaks length = end - beg, but yields a truer length.
    end = xr.where(end.isnull() & beg.notnull(), da[dim].size - 1, end)
    end = end.where(beg.notnull())

    if coord:
        crd = da[dim]
        if isinstance(coord, str):
            crd = getattr(crd.dt, coord)
            coordstr = coord
        else:
            coordstr = dim
        beg = lazy_indexing(crd, beg)
        end = lazy_indexing(crd, end)
    else:
        coordstr = "index"

    out = xr.Dataset({"start": beg, "end": end, "length": length})
    out.start.attrs.update(
        long_name="Start of the season.",
        description=f"First {coordstr} of a run of at least {window} steps respecting the condition.",
    )
    out.end.attrs.update(
        long_name="End of the season.",
        description=f"First {coordstr} of a run of at least {window} "
        "steps breaking the condition, starting after `start`.",
    )
    out.length.attrs.update(
        long_name="Length of the season.",
        description="Number of steps of the original series in the season, between 'start' and 'end'.",
    )
    return out


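# Example (a minimal sketch; values are assumptions):
# with window=2, the single-step break at index 4 does not end the season.
# >>> time = pd.date_range("2000-01-01", periods=9)
# >>> s = xr.DataArray([0, 1, 1, 1, 0, 1, 0, 0, 0], dims="time", coords={"time": time}).astype(bool)
# >>> out = season(s, window=2)
# >>> float(out.start), float(out.end), float(out.length)
# (1.0, 6.0, 5.0)

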
def season_length(
    da: xr.DataArray,
    window: int,
    mid_date: DayOfYearStr | None = None,
    dim: str = "time",
) -> xr.DataArray:
    """
    Length of a season.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive values to start and end the season.
    mid_date : DayOfYearStr, optional
        The date (in MM-DD format) that a run must include to be considered valid.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').

    Returns
    -------
    xr.DataArray, [int]
        Length of the season, in number of elements along dimension `time`.

    See Also
    --------
    season : Calculate the bounds of a season along a dimension.
    season_start : Start of a season.
    season_end : End of a season.
    """
    seas = season(da, window, mid_date, dim, coord=False)
    return seas.length


def run_end_after_date(
    da: xr.DataArray,
    window: int,
    date: DayOfYearStr = "07-01",
    dim: str = "time",
    coord: bool | str | None = "dayofyear",
) -> xr.DataArray:
    """
    Return the index of the first item after the end of a run after a given date.

    The run must begin before the date.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
    date : str
        The date after which to look for the end of a run.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[Union[bool, str]]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of last item in last valid run.
        Returns np.nan if there are no valid runs.
    """
    mid_idx = index_of_date(da[dim], date, max_idxs=1, default=0)
    if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel({dim: 0}), np.nan, float).drop_vars(dim)

    end = first_run(
        (~da).where(da[dim] >= da[dim][mid_idx][0]),
        window=window,
        dim=dim,
        coord=coord,
    )
    beg = first_run(da.where(da[dim] < da[dim][mid_idx][0]), window=window, dim=dim)

    if coord:
        last = da[dim][-1]
        if isinstance(coord, str):
            last = getattr(last.dt, coord)
    else:
        last = da[dim].size - 1

    end = xr.where(end.isnull() & beg.notnull(), last, end)
    return end.where(beg.notnull()).drop_vars(dim, errors="ignore")


def first_run_after_date(
    da: xr.DataArray,
    window: int,
    date: DayOfYearStr | None = "07-01",
    dim: str = "time",
    coord: bool | str | None = "dayofyear",
) -> xr.DataArray:
    """
    Return the index of the first item of the first run after a given date.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
    date : DayOfYearStr
        The date after which to look for the run.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[Union[bool, str]]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of first item in the first valid run.
        Returns np.nan if there are no valid runs.
    """
    mid_idx = index_of_date(da[dim], date, max_idxs=1, default=0)
    if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel({dim: 0}), np.nan, float).drop_vars(dim)

    return first_run(
        da.where(da[dim] >= da[dim][mid_idx][0]),
        window=window,
        dim=dim,
        coord=coord,
    )


def last_run_before_date(
    da: xr.DataArray,
    window: int,
    date: DayOfYearStr = "07-01",
    dim: str = "time",
    coord: bool | str | None = "dayofyear",
) -> xr.DataArray:
    """
    Return the index of the last item of the last run before a given date.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
    date : DayOfYearStr
        The date before which to look for the last event.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[Union[bool, str]]
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of last item in last valid run.
        Returns np.nan if there are no valid runs.
    """
    mid_idx = index_of_date(da[dim], date, default=-1)

    if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel({dim: 0}), np.nan, float).drop_vars(dim)

    run = da.where(da[dim] <= da[dim][mid_idx][0])
    return last_run(run, window=window, dim=dim, coord=coord)


def first_run_before_date(
    da: xr.DataArray,
    window: int,
    date: DayOfYearStr | None = "07-01",
    dim: str = "time",
    coord: bool | str | None = "dayofyear",
) -> xr.DataArray:
    """
    Return the index of the first item of the first run before a given date.

    Parameters
    ----------
    da : xr.DataArray
        Input N-dimensional DataArray (boolean).
    window : int
        Minimum duration of consecutive run to accumulate values.
    date : DayOfYearStr
        The date before which to look for the run.
    dim : str
        Dimension along which to calculate consecutive run (default: 'time').
    coord : bool or str, optional
        If not False, the function returns values along `dim` instead of indexes.
        If `dim` has a datetime dtype, `coord` can also be a str of the name of the
        DateTimeAccessor object to use (e.g. 'dayofyear').

    Returns
    -------
    xr.DataArray
        Index (or coordinate if `coord` is not False) of first item in the first valid run.
        Returns np.nan if there are no valid runs.
    """
    if date is not None:
        mid_idx = index_of_date(da[dim], date, max_idxs=1, default=0)
        if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
            return xr.full_like(da.isel({dim: 0}), np.nan, float).drop_vars(dim)
        # Mask anything after mid_date + window - 1.
        # Thus, the latest run possible can begin on the day just before mid_idx.
        da = da.where(da[dim] < da[dim][mid_idx + window - 1][0])

    return first_run(
        da,
        window=window,
        dim=dim,
        coord=coord,
    )


@njit
def _rle_1d(ia):
    y = ia[1:] != ia[:-1]  # pairwise unequal (string safe)
    i = np.append(np.nonzero(y)[0], ia.size - 1)  # must include last element position
    rl = np.diff(np.append(-1, i))  # run lengths
    pos = np.cumsum(np.append(0, rl))[:-1]  # positions
    return ia[i], rl, pos


def rle_1d(
    arr: int | float | bool | Sequence[int | float | bool],
) -> tuple[np.array, np.array, np.array]:
    """
    Return the values, lengths and starting positions of runs of consecutive identical values.

    In contrast to :py:func:`rle`, this is an actual run-length encoder.

    Parameters
    ----------
    arr : Sequence[Union[int, float, bool]]
        Array of values to be parsed.

    Returns
    -------
    values : np.array
        The values taken by arr over each run.
    run lengths : np.array
        The length of each run.
    start position : np.array
        The starting index of each run.

    Examples
    --------
    >>> from xclim.indices.run_length import rle_1d
    >>> a = [1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3]
    >>> rle_1d(a)
    (array([1, 2, 3]), array([2, 4, 6]), array([0, 2, 6]))
    """
    ia = np.asarray(arr)
    n = len(ia)

    if n == 0:
        warn("run length array empty")
        # Returning None makes some other 1d funcs below fail.
        return np.array(np.nan), 0, np.array(np.nan)
    return _rle_1d(ia)


def first_run_1d(arr: Sequence[int | float], window: int) -> int | float:
    """
    Return the index of the first item of a run of at least a given length.

    Parameters
    ----------
    arr : sequence of int or float
        Input array.
    window : int
        Minimum duration of consecutive run to accumulate values.

    Returns
    -------
    int or np.nan
        Index of first item in first valid run.
        Returns np.nan if there are no valid runs.
    """
    v, rl, pos = rle_1d(arr)
    ind = np.where(v * rl >= window, pos, np.inf).min()  # noqa

    if np.isinf(ind):
        return np.nan
    return ind


def statistics_run_1d(arr: Sequence[bool], reducer: str, window: int) -> int:
    """
    Return statistics on lengths of runs of identical values.

    Parameters
    ----------
    arr : sequence of bool
        Input array (bool).
    reducer : {"mean", "sum", "min", "max", "std", "count"}
        Reducing function name.
    window : int
        Minimal length of runs to be included in the statistics.

    Returns
    -------
    int
        Statistics on length of runs.
    """
    v, rl = rle_1d(arr)[:2]
    if not np.any(v) or np.all(v * rl < window):
        return 0
    if reducer == "count":
        return (v * rl >= window).sum()
    func = getattr(np, f"nan{reducer}")
    return func(np.where(v * rl >= window, rl, np.nan))


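# Example (a minimal sketch; values are assumptions):
# runs of True have lengths 3 and 2; their mean is 2.5.
# >>> arr = np.array([True, True, True, False, True, True])
# >>> statistics_run_1d(arr, reducer="mean", window=2)
# 2.5

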
1417
def windowed_run_count_1d(arr: Sequence[bool], window: int) -> int:
9✔
1418
    """
1419
    Return the number of consecutive true values in array for runs at least as long as given duration.
1420

1421
    Parameters
1422
    ----------
1423
    arr : Sequence[bool]
1424
        Input array (bool).
1425
    window : int
1426
        Minimum duration of consecutive run to accumulate values.
1427

1428
    Returns
1429
    -------
1430
    int
1431
        Total number of true values part of a consecutive run at least `window` long.
1432
    """
1433
    v, rl = rle_1d(arr)[:2]
9✔
1434
    return np.where(v * rl >= window, rl, 0).sum()
9✔
1435

1436

1437
def windowed_run_events_1d(arr: Sequence[bool], window: int) -> xr.DataArray:
9✔
1438
    """
1439
    Return the number of runs of a minimum length.
1440

1441
    Parameters
1442
    ----------
1443
    arr : Sequence[bool]
1444
        Input array (bool).
1445
    window : int
1446
        Minimum run length.
1447

1448
    Returns
1449
    -------
1450
    xr.DataArray, [int]
1451
        Number of distinct runs of a minimum length.
1452
    """
1453
    v, rl, _ = rle_1d(arr)
9✔
1454
    return (v * rl >= window).sum()
9✔
1455

1456
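
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# windowed_run_events_1d counts the runs themselves rather than their lengths.
_bools = np.array([True, True, False, True, True, True])
windowed_run_events_1d(_bools, window=2)  # -> 2, two distinct runs of >= 2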

1457
def windowed_run_count_ufunc(x: xr.DataArray | Sequence[bool], window: int, dim: str) -> xr.DataArray:
9✔
1458
    """
1459
    Dask-parallel version of windowed_run_count_1d.
1460

1461
    The number of consecutive true values in array for runs at least as long as the given duration.
1462

1463
    Parameters
1464
    ----------
1465
    x : Sequence[bool]
1466
        Input array (bool).
1467
    window : int
1468
        Minimum duration of consecutive run to accumulate values.
1469
    dim : str
1470
        Dimension along which to calculate windowed run.
1471

1472
    Returns
1473
    -------
1474
    xr.DataArray
1475
        Total number of true values part of consecutive runs at least `window` long, computed along `dim`.
1476
    """
1477
    return xr.apply_ufunc(
9✔
1478
        windowed_run_count_1d,
1479
        x,
1480
        input_core_dims=[[dim]],
1481
        vectorize=True,
1482
        dask="parallelized",
1483
        output_dtypes=[int],
1484
        keep_attrs=True,
1485
        kwargs={"window": window},
1486
    )
1487

1488
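
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# The ufunc wrappers vectorize their 1d counterparts over the non-core
# dimensions via apply_ufunc; windowed_run_events_ufunc and
# statistics_run_ufunc follow the same pattern. `xr`/`np` are the
# module-level imports.
_da = xr.DataArray(
    np.array([[1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 0]], dtype=bool),
    dims=("site", "time"),
)
windowed_run_count_ufunc(_da, window=3, dim="time")  # -> [3, 4] along "site"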

1489
def windowed_run_events_ufunc(x: xr.DataArray | Sequence[bool], window: int, dim: str) -> xr.DataArray:
9✔
1490
    """
1491
    Dask-parallel version of windowed_run_events_1d.
1492

1493
    The number of runs at least as long as the given duration.
1494

1495
    Parameters
1496
    ----------
1497
    x : Sequence[bool]
1498
        Input array (bool).
1499
    window : int
1500
        Minimum run length.
1501
    dim : str
1502
        Dimension along which to calculate windowed run.
1503

1504
    Returns
1505
    -------
1506
    xr.DataArray
1507
        The number of distinct runs at least `window` long, computed along `dim`.
1508
    """
1509
    return xr.apply_ufunc(
9✔
1510
        windowed_run_events_1d,
1511
        x,
1512
        input_core_dims=[[dim]],
1513
        vectorize=True,
1514
        dask="parallelized",
1515
        output_dtypes=[int],
1516
        keep_attrs=True,
1517
        kwargs={"window": window},
1518
    )
1519

1520

1521
def statistics_run_ufunc(
9✔
1522
    x: xr.DataArray | Sequence[bool],
1523
    reducer: str,
1524
    window: int,
1525
    dim: str = "time",
1526
) -> xr.DataArray:
1527
    """
1528
    Dask-parallel version of statistics_run_1d.
1529

1530
    The {reducer} of the lengths of runs of consecutive true values in the array.
1531

1532
    Parameters
1533
    ----------
1534
    x : sequence of bool
1535
        Input array (bool).
1536
    reducer : {'min', 'max', 'mean', 'sum', 'std', 'count'}
1537
        Reducing function name.
1538
    window : int
1539
        Minimal length of runs.
1540
    dim : str
1541
        The dimension along which the runs are found.
1542

1543
    Returns
1544
    -------
1545
    xr.DataArray
1546
        The statistic of the lengths of runs at least `window` long, computed along `dim`.
1547
    """
1548
    return xr.apply_ufunc(
9✔
1549
        statistics_run_1d,
1550
        x,
1551
        input_core_dims=[[dim]],
1552
        kwargs={"reducer": reducer, "window": window},
1553
        vectorize=True,
1554
        dask="parallelized",
1555
        output_dtypes=[float],
1556
        keep_attrs=True,
1557
    )
1558

1559
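
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# Same apply_ufunc pattern as above, but reducing run lengths along `dim`.
_da = xr.DataArray(np.array([1, 1, 0, 1, 1, 1], dtype=bool), dims=("time",))
statistics_run_ufunc(_da, reducer="max", window=2)  # -> 3.0, longest qualifying run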

1560
def first_run_ufunc(
9✔
1561
    x: xr.DataArray | Sequence[bool],
1562
    window: int,
1563
    dim: str,
1564
) -> xr.DataArray:
1565
    """
1566
    Dask-parallel version of first_run_1d.
1567

1568
    The index of the first run of consecutive true values at least `window` long.
1569

1570
    Parameters
1571
    ----------
1572
    x : xr.DataArray or Sequence[bool]
1573
        Input array (bool).
1574
    window : int
1575
        Minimum run length.
1576
    dim : str
1577
        The dimension along which the runs are found.
1578

1579
    Returns
1580
    -------
1581
    xr.DataArray
1582
        Index of the first item of the first run at least `window` long along `dim`; NaN where no such run exists.
1583
    """
1584
    ind = xr.apply_ufunc(
9✔
1585
        first_run_1d,
1586
        x,
1587
        input_core_dims=[[dim]],
1588
        vectorize=True,
1589
        dask="parallelized",
1590
        output_dtypes=[float],
1591
        keep_attrs=True,
1592
        kwargs={"window": window},
1593
    )
1594

1595
    return ind
9✔
1596

1597
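
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# first_run_ufunc returns float indices so that NaN can mark slices
# with no qualifying run.
_da = xr.DataArray(
    np.array([[0, 1, 1, 1], [0, 0, 0, 0]], dtype=bool), dims=("x", "time")
)
first_run_ufunc(_da, window=2, dim="time")  # -> [1.0, nan] along "x"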

1598
def lazy_indexing(da: xr.DataArray, index: xr.DataArray, dim: str | None = None) -> xr.DataArray:
9✔
1599
    """
1600
    Get values of `da` at indices `index` in a NaN-aware and lazy manner.
1601

1602
    Parameters
1603
    ----------
1604
    da : xr.DataArray
1605
        Input array. If not 1D, `dim` must be given and must not appear in index.
1606
    index : xr.DataArray
1607
        N-D integer indices. If `da` is not 1D, all dimensions of `index` must also be in `da`.
1608
    dim : str, optional
1609
        Dimension along which to index, unused if `da` is 1D, should not be present in `index`.
1610

1611
    Returns
1612
    -------
1613
    xr.DataArray
1614
        Values of `da` at indices `index`.
1615
    """
1616
    if da.ndim == 1:
9✔
1617
        # Case where da is 1D and index is N-D
1618
        # Slightly better performance using map_blocks, over an apply_ufunc
1619
        def _index_from_1d_array(indices, array):
9✔
1620
            return array[indices]
9✔
1621

1622
        idx_ndim = index.ndim
9✔
1623
        if idx_ndim == 0:
9✔
1624
            # The 0-D index case, we add a dummy dimension to help dask
1625
            dim = get_temp_dimname(da.dims, "x")
9✔
1626
            index = index.expand_dims(dim)
9✔
1627
        # Which indexes to mask.
1628
        invalid = index.isnull()
9✔
1629
        # NaN-indexing doesn't work, so fill with 0 and cast to int
1630
        index = index.fillna(0).astype(int)
9✔
1631

1632
        # No need for coords, we extract by integer index.
1633
        # Renaming with no name to fix bug in xr 2024.01.0
1634
        tmpname = get_temp_dimname(da.dims, "temp")
9✔
1635
        da2 = xr.DataArray(da.data, dims=(tmpname,), name=None)
9✔
1636
        # Map blocks chunks aux coords. Remove them to avoid the alignment check load in `where`
1637
        index, auxcrd = split_auxiliary_coordinates(index)
9✔
1638
        # for each chunk of index, take corresponding values from da
1639
        out = index.map_blocks(_index_from_1d_array, args=(da2,)).rename(da.name)
9✔
1640
        # mask where index was NaN. Drop any auxiliary coord, they are already on `out`.
1641
        # Chunked aux coord would have the same name on both sides and xarray will want to check if they are equal,
1642
        # which means loading them making lazy_indexing not lazy. same issue as above
1643
        out = out.where(~invalid.drop_vars([crd for crd in invalid.coords if crd not in invalid.dims]))
9✔
1644
        out = out.assign_coords(auxcrd.coords)
9✔
1645
        if idx_ndim == 0:
9✔
1646
            # 0-D case, drop useless coords and dummy dim
1647
            out = out.drop_vars(da.dims[0], errors="ignore").squeeze()
9✔
1648
        return out.drop_vars(dim or da.dims[0], errors="ignore")
9✔
1649

1650
    # Case where index.dims is a subset of da.dims.
1651
    if dim is None:
9✔
1652
        diff_dims = set(da.dims) - set(index.dims)
9✔
1653
        if len(diff_dims) == 0:
9✔
1654
            raise ValueError("da must have at least one dimension more than index for lazy_indexing.")
9✔
1655
        if len(diff_dims) > 1:
9✔
1656
            raise ValueError(
9✔
1657
                "If da has more than one dimension more than index, the indexing dim must be given through `dim`"
1658
            )
1659
        dim = diff_dims.pop()
×
1660

1661
    def _index_from_nd_array(array, indices):
9✔
1662
        return np.take_along_axis(array, indices[..., np.newaxis], axis=-1)[..., 0]
9✔
1663

1664
    return xr.apply_ufunc(
9✔
1665
        _index_from_nd_array,
1666
        da,
1667
        index.astype(int),
1668
        input_core_dims=[[dim], []],
1669
        output_core_dims=[[]],
1670
        dask="parallelized",
1671
        output_dtypes=[da.dtype],
1672
    )
1673

1674
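
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# A sketch of the 1-D branch of lazy_indexing: a 1-D array indexed by an N-D
# integer index, with NaN indices propagating to NaN in the output.
_da = xr.DataArray(np.arange(10.0), dims=("time",))
_idx = xr.DataArray(np.array([[2.0, 5.0], [0.0, np.nan]]), dims=("y", "x"))
lazy_indexing(_da, _idx)  # -> [[2., 5.], [0., nan]] with dims ("y", "x")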

1675
def index_of_date(
9✔
1676
    time: xr.DataArray,
1677
    date: DateStr | DayOfYearStr | None,
1678
    max_idxs: int | None = None,
1679
    default: int = 0,
1680
) -> np.ndarray:
1681
    """
1682
    Get the index of a date in a time array.
1683

1684
    Parameters
1685
    ----------
1686
    time : xr.DataArray
1687
        An array of datetime values, any calendar.
1688
    date : DayOfYearStr or DateStr, optional
1689
        A string in the "yyyy-mm-dd" or "mm-dd" format.
1690
        If None, returns default.
1691
    max_idxs : int, optional
1692
        Maximum number of returned indexes.
1693
    default : int
1694
        Index to return if date is None.
1695

1696
    Returns
1697
    -------
1698
    numpy.ndarray
1699
        1D array of integers, indexes of `date` in `time`.
1700

1701
    Raises
1702
    ------
1703
    ValueError
1704
        If there are more instances of `date` in `time` than `max_idxs`.
1705
    """
1706
    if date is None:
9✔
1707
        return np.array([default])
9✔
1708
    if len(date.split("-")) == 2:
9✔
1709
        date = f"1840-{date}"
9✔
1710
        date = datetime.strptime(date, "%Y-%m-%d")
9✔
1711
        year_cond = True
9✔
1712
    else:
1713
        date = datetime.strptime(date, "%Y-%m-%d")
×
1714
        year_cond = time.dt.year == date.year
×
1715

1716
    idxs = np.where(year_cond & (time.dt.month == date.month) & (time.dt.day == date.day))[0]
9✔
1717
    if max_idxs is not None and idxs.size > max_idxs:
9✔
1718
        raise ValueError(f"More than {max_idxs} instance of date {date} found in the coordinate array.")
9✔
1719
    return idxs
9✔
1720

1721
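
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# A "mm-dd" date matches every year in the coordinate; a full "yyyy-mm-dd"
# date matches at most one. `pd`/`xr` are the module-level imports.
_time = xr.DataArray(pd.date_range("2000-01-01", periods=730), dims=("time",))
index_of_date(_time, "07-01")  # -> array([182, 547]), one hit per year
index_of_date(_time, "2000-07-01", max_idxs=1)  # -> array([182])
index_of_date(_time, None, default=0)  # -> array([0])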

1722
def suspicious_run_1d(
9✔
1723
    arr: np.ndarray,
1724
    window: int = 10,
1725
    op: Literal[">", "gt", "<", "lt", ">=", "ge", "<=", "le", "==", "eq", "!=", "ne"] = ">",
1726
    thresh: float | None = None,
1727
) -> np.ndarray:
1728
    """
1729
    Return `True` where the array contains a run of identical values.
1730

1731
    Parameters
1732
    ----------
1733
    arr : numpy.ndarray
1734
        Array of values to be parsed.
1735
    window : int
1736
        Minimum run length.
1737
    op : {">", "gt", "<", "lt", ">=", "ge", "<=", "le", "==", "eq", "!=", "ne"}
1738
        Operator for threshold comparison. Defaults to ">".
1739
    thresh : float, optional
1740
        Threshold against which the values of runs are compared.
1741

1742
    Returns
1743
    -------
1744
    numpy.ndarray
1745
        Whether the data points are part of a run of identical values or not.
1746
    """
1747
    v, rl, pos = rle_1d(arr)
9✔
1748
    sus_runs = rl >= window
9✔
1749
    if thresh is not None:
9✔
1750
        if op in {">", "gt"}:
9✔
1751
            sus_runs = sus_runs & (v > thresh)
9✔
1752
        elif op in {"<", "lt"}:
9✔
1753
            sus_runs = sus_runs & (v < thresh)
×
1754
        elif op in {"==", "eq"}:
9✔
1755
            sus_runs = sus_runs & (v == thresh)
9✔
1756
        elif op in {"!=", "ne"}:
×
1757
            sus_runs = sus_runs & (v != thresh)
×
1758
        elif op in {">=", "gteq", "ge"}:
×
1759
            sus_runs = sus_runs & (v >= thresh)
×
1760
        elif op in {"<=", "lteq", "le"}:
×
1761
            sus_runs = sus_runs & (v <= thresh)
×
1762
        else:
1763
            raise NotImplementedError(f"{op}")
×
1764

1765
    out = np.zeros_like(arr, dtype=bool)
9✔
1766
    for st, l in zip(pos[sus_runs], rl[sus_runs], strict=False):  # noqa: E741
9✔
1767
        out[st : st + l] = True  # noqa: E741
9✔
1768
    return out
9✔
1769

1770
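
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# Flag a run of four identical values above a threshold, e.g. a stuck sensor.
_vals = np.array([1.0, 3.0, 3.0, 3.0, 3.0, 2.0])
suspicious_run_1d(_vals, window=4, op=">", thresh=2.0)
# -> [False, True, True, True, True, False]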

1771
def suspicious_run(
9✔
1772
    arr: xr.DataArray,
1773
    dim: str = "time",
1774
    window: int = 10,
1775
    op: Literal[">", "gt", "<", "lt", ">=", "ge", "<=", "le", "==", "eq", "!=", "ne"] = ">",
1776
    thresh: float | None = None,
1777
) -> xr.DataArray:
1778
    """
1779
    Return `True` where the array has runs of identical values; vectorized version.
1780

1781
    Unlike other run length functions, the output here has the same shape as the input.
1782

1783
    Parameters
1784
    ----------
1785
    arr : xr.DataArray
1786
        Array of values to be parsed.
1787
    dim : str
1788
        Dimension along which to check for runs (default: "time").
1789
    window : int
1790
        Minimum run length.
1791
    op : {">", "gt", "<", "lt", ">=", "ge", "<=", "le", "==", "eq", "!=", "ne"}
1792
        Operator for threshold comparison, defaults to ">".
1793
    thresh : float, optional
1794
        Threshold against which run values are compared using `op`.
1795

1796
    Returns
1797
    -------
1798
    xarray.DataArray
1799
        A boolean array of the same shape as the input, indicating where runs of identical values are found.
1800
    """
1801
    return xr.apply_ufunc(
9✔
1802
        suspicious_run_1d,
1803
        arr,
1804
        input_core_dims=[[dim]],
1805
        output_core_dims=[[dim]],
1806
        vectorize=True,
1807
        dask="parallelized",
1808
        output_dtypes=[bool],
1809
        keep_attrs=True,
1810
        kwargs={"window": window, "op": op, "thresh": thresh},
1811
    )
1812

1813

1814
def _find_events(da_start, da_stop, data, window_start, window_stop):
9✔
1815
    """
1816
    Actual finding of events for each period.
1817

1818
    The basic blocks are runs with holes: series of ones marking
1819
    continuous runs whose pauses do not exceed `window_stop`,
1820
    together with the lengths of those runs.
1821
    """
1822
    runs = runs_with_holes(da_start, window_start, da_stop, window_stop)
9✔
1823

1824
    # Compute the length of the events (e.g. freezing rain spells)
1825
    # int16 should be safe enough; fillna first to suppress the cast warning
1826
    ds = rle(runs).fillna(0).astype(np.int16).to_dataset(name="event_length")
9✔
1827
    # Time duration where the precipitation threshold is exceeded during an event
1828
    # (duration of complete run - duration of holes in the run )
1829
    ds["event_effective_length"] = _cumsum_reset(da_start.where(runs == 1), index="first", reset_on_zero=False).astype(
9✔
1830
        np.int16
1831
    )
1832

1833
    if data is not None:
9✔
1834
        # Ex: Cumulated precipitation in a given freezing rain event
1835
        ds["event_sum"] = _cumsum_reset(data.where(runs == 1), index="first", reset_on_zero=False)
9✔
1836

1837
    # Keep time as a variable, it will be used to keep start of events
1838
    ds["event_start"] = ds["time"].broadcast_like(ds)  # .astype(int)
9✔
1839
    # We convert to an integer for the filtering, time object won't do well in the apply_ufunc/vectorize
1840
    time_min = ds.event_start.min()
9✔
1841
    ds["event_start"] = ds.event_start.copy(
9✔
1842
        data=(ds.event_start - time_min).values.astype("timedelta64[s]").astype(int)
1843
    )
1844

1845
    # Filter events: Reduce time dimension
1846
    def _filter_events(da, rl, max_event_number):
9✔
1847
        out = np.full(max_event_number, np.nan)
9✔
1848
        events_start = da[rl > 0]
9✔
1849
        out[: len(events_start)] = events_start
9✔
1850
        return out
9✔
1851

1852
    # Dask inputs need to be told their length before computing anything.
1853
    max_event_number = int(np.ceil(da_start.time.size / (window_start + window_stop)))
9✔
1854
    ds = xr.apply_ufunc(
9✔
1855
        _filter_events,
1856
        ds,
1857
        ds.event_length,
1858
        input_core_dims=[["time"], ["time"]],
1859
        output_core_dims=[["event"]],
1860
        kwargs={"max_event_number": max_event_number},
1861
        dask_gufunc_kwargs={"output_sizes": {"event": max_event_number}},
1862
        dask="parallelized",
1863
        vectorize=True,
1864
    )
1865

1866
    # convert back start to a time
1867
    if time_min.dtype == "O":
9✔
1868
        # Can't add a numpy array of timedeltas to an array of cftime (because they have non-compatible dtypes)
1869
        # Also, we can't add cftime to NaTType. So we fill with negative timedeltas and mask them after the addition
1870
        # Also, pd.to_timedelta can only handle 1D input, so we vectorize
1871

1872
        def _get_start_cftime(deltas, time_min=None):
9✔
1873
            starts = time_min + pd.to_timedelta(deltas, "s").to_pytimedelta()
9✔
1874
            starts[starts < time_min] = np.nan
9✔
1875
            return starts
9✔
1876

1877
        ds["event_start"] = xr.apply_ufunc(
9✔
1878
            _get_start_cftime,
1879
            ds.event_start.fillna(-1),
1880
            input_core_dims=[("event",)],
1881
            output_core_dims=[("event",)],
1882
            vectorize=True,
1883
            dask="parallelized",
1884
            kwargs={"time_min": time_min.item()},
1885
            output_dtypes=[time_min.dtype],
1886
        )
1887
    else:
1888
        ds["event_start"] = ds.event_start.copy(data=time_min.values + ds.event_start.data.astype("timedelta64[s]"))
9✔
1889

1890
    ds["event"] = np.arange(1, ds.event.size + 1)
9✔
1891
    ds["event_length"].attrs["units"] = "1"
9✔
1892
    ds["event_effective_length"].attrs["units"] = "1"
9✔
1893
    ds["event_start"].attrs["units"] = ""
9✔
1894
    if data is not None:
9✔
1895
        ds["event_sum"].attrs["units"] = data.units
9✔
1896
    return ds
9✔
1897

1898

1899
# TODO: Implement more event stats ? (max, effective sum, etc)
1900
def find_events(
9✔
1901
    condition: xr.DataArray,
1902
    window: int,
1903
    condition_stop: xr.DataArray | None = None,
1904
    window_stop: int = 1,
1905
    data: xr.DataArray | None = None,
1906
    freq: str | None = None,
1907
):
1908
    """
1909
    Find events (runs).
1910

1911
    An event starts with a run of ``window`` consecutive True values in the condition
1912
    and stops with ``window_stop`` consecutive True values in the stop condition.
1913

1914
    This returns a Dataset with each event along an `event` dimension.
1915
    It does not perform statistics over the events like other functions in this module do.
1916

1917
    Parameters
1918
    ----------
1919
    condition : DataArray of bool
1920
        The boolean mask, true where the start condition of the event is fulfilled.
1921
    window : int
1922
        The number of consecutive True values for an event to start.
1923
    condition_stop : DataArray of bool, optional
1924
        The stopping boolean mask, true where the end condition of the event is fulfilled.
1925
        Defaults to the opposite of `condition`.
1926
    window_stop : int
1927
        The number of consecutive True values in ``condition_stop`` for an event to end.
1928
        Defaults to 1.
1929
    data : DataArray, optional
1930
        The actual data. If present, its sum within each event is added to the output.
1931
    freq : str, optional
1932
        A frequency to divide the data into periods. If absent, the output has no time dimension.
1933
        If given, events are searched within each resampling period independently.
1934

1935
    Returns
1936
    -------
1937
    xr.Dataset, same shape as the data, with the time dimension resampled or removed, according to ``freq``.
1938
        The Dataset has the following variables:
1939
            event_length: The number of time steps in each event
1940
            event_effective_length: The number of time steps of each event where the start condition is true.
1941
            event_start: The datetime of the start of the run.
1942
            event_sum: The sum within each event, only considering steps where start condition is true (if ``data``).
1943
    """
1944
    if condition_stop is None:
9✔
1945
        condition_stop = ~condition
9✔
1946

1947
    if freq is None:
9✔
1948
        return _find_events(condition, condition_stop, data, window, window_stop)
9✔
1949

1950
    ds = xr.Dataset({"da_start": condition, "da_stop": condition_stop})
9✔
1951
    if data is not None:
9✔
1952
        ds = ds.assign(data=data)
9✔
1953
    return ds.resample(time=freq).map(
9✔
1954
        lambda grp: _find_events(grp.da_start, grp.da_stop, grp.get("data", None), window, window_stop)
1955
    )
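
# --- Illustrative usage (editorial addition, not part of the xclim source) ---
# A minimal sketch of find_events: with the default stop condition
# (~condition), an event starts after `window` consecutive True values and
# ends at the first False. Variable layout follows the docstring above.
_cond = xr.DataArray(
    np.array([0, 1, 1, 1, 0, 0, 1, 1, 0], dtype=bool),
    dims=("time",),
    coords={"time": pd.date_range("2000-01-01", periods=9)},
)
_events = find_events(_cond, window=2)
# _events.event_length -> [3, 2, nan] along the "event" dimension
# _events.event_start  -> the dates 2000-01-02 and 2000-01-07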