iridl · remicousin · Jan 30, 2026 · Aug 27, 2025 · Aug 27, 2025 · Nov 13, 2025
diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
@@ -214,7 +214,7 @@ def daily_tobegroupedby_season(
 
     Parameters
     -----------
-    daily_data : DataArray
+    daily_data : xr.DataArray
         Daily data to be grouped.
     start_day : int
         Day of the start date  of the season.
@@ -293,7 +293,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0):
 
     Parameters
     ----------
-    daily_dim : DataArray[datetime64[ns]]
+    daily_dim : xr.DataArray[datetime64[ns]]
         A daily time dimension.
     day : int
         day of the `month`.
@@ -305,7 +305,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0):
 
     Returns
     -------
-    DataArray[datetime64[ns]]
+    xr.DataArray[datetime64[ns]]
         a subset of `daily_dim` with all and only `day`-`month` points, offset by `offset` days.
 
     See Also
@@ -375,3 +375,251 @@ def strftimeb2int(strftimeb):
     }
     strftimebint = strftimeb_all[strftimeb]
     return strftimebint
+
+
+def _cumsum_flagged_diff(flagged_data, dim):
+    """ Discrete difference of cumulative flags keeping only the unflagged data
+
+    Last 0s before a 1 return the count of consecutive 1s after that 0, except for
+    the first value that will return 0 if 0, and the number of consecutive 1s if 1,
+    and the last value that is dropped.
+    The result is expressed in the units of `dim`, that is assumed evenly spaced.
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to search for runs
+
+    Returns
+    -------
+    xr.DataArray
+        Array of lengths of spells of flags along `dim`
+
+    See Also
+    --------
+    count_days_in_spell, length_of_longest_spell, mean_length_of_spell,
+    median_length_of_spell
+
+    Notes
+    -----
+    Because diff needs at least 2 points,
+    we need to keep (where) the unflagged and first and last
+    with the cumulative value for last and 0 for first.
+    Cumulative flags, where kept, need be propagated by bfill
+    so that diff returns 0 or the length of runs.
+
+    I believe that it works for unevenly spaced `dim`,
+    only we then don't know what the units of the result are.
+
+    Is meant to be further reduced to for instance find the longest spell, or the
+    numbers of days in spells, etc.
+
+    Examples
+    --------
+    >>> t = pd.date_range(start="2000-05-01", end="2000-05-29", freq="1D")
+    >>> values = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
+    ... 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0]
+    >>> flags = xr.DataArray(values, dims=["T"], coords={"T": t})
+    >>> _cumsum_flagged_diff(flags, "T")
+    <xarray.DataArray ()>
+    array(0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
+    ... 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0)    
+    """
+
+    # Special case coord.size = 1
+    cfd = flagged_data
+    if cfd[dim].size != 1:
+        # Points to apply diff to
+        unflagged_and_ends = (flagged_data == 0) * 1
+        unflagged_and_ends[{dim: [0, -1]}] = 1
+
+        cfd = cfd.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
+            # first cumul point must be set to 0
+            lambda x: x[dim] != cfd[dim][0], other=0
+        ).bfill(dim).diff(dim)
+    return cfd
+
+def count_days_in_spells(flagged_data, dim, min_spell_length=1):
+    """ Counts number of days in spells.
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length of for a spell to be accounted for
+
+    Returns
+    -------
+    xr.DataArray
+        Array of number of days found in spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------   
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).sum(dim=dim)
+
+def length_of_longest_spell(flagged_data, dim):
+    """ Length of longest spells.
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+
+    Returns
+    -------
+    xr.DataArray
+        Array of lengths of longest spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------   
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).max(dim=dim)
+
+def mean_length_of_spells(flagged_data, dim, min_spell_length=1):
+    """ Mean length of spells.
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length of for a spell to be accounted for
+
+    Returns
+    -------
+    xr.DataArray
+        Array of mean length of spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------   
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).mean(dim=dim)
+
+def median_length_of_spells(flagged_data, dim, min_spell_length=1):
+    """ Median length of spells.
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length of for a spell to be accounted for
+
+    Returns
+    -------
+    xr.DataArray
+        Array of median length of spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------   
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).median(dim=dim)
+
+
+def _accumulate_spells(flagged_data, axis=0, dtype=None, out=None):
+    """Accumulates continuous flags. Count reset to 0 when new data is unflagged
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    axis : int, optional
+        The axis along which to apply the accumulation; default is zero.
+    dtype : data-type code, optional
+        The data-type used to represent the intermediate results. Defaults to the 
+        data-type of the output array if such is provided, or the data-type of the 
+        input array if no output array is provided.
+    out : ndarray, None, or tuple of ndarray and None, optional
+        Location into which the result is stored. If not provided or None, a 
+        freshly-allocated array is returned. For consistency with ufunc.__call__, if 
+        passed as a keyword argument, can be Ellipses (out=..., which has the same 
+        effect as None as an array is always returned), or a 1-element tuple.
+
+    Returns
+    -------
+    xr.DataArray
+        Accumulated continuous flags. Count reset to 0 when new data is unflagged.
+
+    See Also
+    --------
+    numpy.ufunc.accumulate, spells_length
+    """
+    return xr.apply_ufunc(
+        np.frompyfunc(lambda x, y: 0 if y == 0 else x + y, 2, 1).accumulate,
+        flagged_data, axis, dtype, out
+    )
+
+
+def spells_length(flagged_data, dim):
+    """Returns length of spell of flagged data at the last point of the spell, nan 
+    everywhere else
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+
+    Returns
+    -------
+    xr.DataArray
+        Length of spell of flagged data at the last point of the spell, nan 
+        everywhere else
+
+    See Also
+    --------
+    _accumulate_spells
+    """
+    # cumuls 1s and resets counts when 0
+    # then rolls data by 1 to position total cumuls on 0s of the flagged data
+    # except for the last point that becomes first
+    spells = _accumulate_spells(
+        flagged_data, axis=flagged_data.get_axis_num(dim)
+    ).roll(**{dim: 1})
+    # masks out where data flag to rid of accumulating values but last of each spell
+    spells_only = spells.where(flagged_data == 0)
+    # resets first value to what it was as it could have been erased in previous step
+    spells_only[dict(**{dim : 0})] = spells.isel({dim : 0})
+    # rolls back to get spells length values on last day of spell
+    spells = spells_only.roll(**{dim: -1})
+    # Turns remaining 0s to NaN
+    return spells.where(spells > 0)
diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pandas as pd
+import xarray as xr
+import app_calc
+
+
+def test__cumsum_flagged_diff():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [1, 2, 0, 0, 0, 3, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_where_last_is_flagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [1, 2, 0, 0, 5, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_all_flagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [10, 0, 0, 0, 0, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_all_unflagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [0, 0, 0, 0, 0, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_coord_size_1():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    precip_0_1 = precip[0]
+    precip_0_0 = precip_0_1 - 1
+    spells1 = app_calc._cumsum_flagged_diff(precip_0_1, "T")
+    spells0 = app_calc._cumsum_flagged_diff(precip_0_0, "T")
+
+    assert spells1 == 1
+    assert spells0 == 0
+
+
+def test_spells_length():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc.spells_length(precip, "T")
+    expected = [1, np.nan, np.nan, 2, np.nan, np.nan, np.nan, np.nan, 3, np.nan]
+
+    np.testing.assert_array_equal(spells.isnull(), np.isnan(expected))
+    np.testing.assert_array_equal(
+        spells.where(~spells.isnull(), other=0),
+        np.where(np.isnan(expected), 0, expected)
+    )
+
+
+def test_count_days_in_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    days_in_spells = app_calc.count_days_in_spells(precip, "T", min_spell_length=2)
+
+    assert days_in_spells == 5
+
+def test_length_of_longest_spell():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    longest_spell = app_calc.length_of_longest_spell(precip, "T")
+
+    assert longest_spell == 3
+
+def test_mean_length_of_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    mean_spell = app_calc.mean_length_of_spells(precip, "T", min_spell_length=2)
+
+    assert mean_spell == 2.5
+
+def test_median_length_of_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    mendian_spell = app_calc.median_length_of_spells(precip, "T")
+
+    assert mendian_spell == 2