From 083b6f15bcc9e49ce8848cd713600d1a75e4558c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Cousin?=
Date: Wed, 27 Aug 2025 13:20:05 -0400
Subject: [PATCH 1/4] background function to identify spells, inspired from enacts/calc.pu

---
 pepsico/app_calc.py           | 64 +++++++++++++++++++++++++++++++++++
 pepsico/test/test_app_calc.py | 58 +++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 pepsico/test/test_app_calc.py

diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
index a52b860f..4cacdf9c 100644
--- a/pepsico/app_calc.py
+++ b/pepsico/app_calc.py
@@ -375,3 +375,67 @@ def strftimeb2int(strftimeb):
     }
     strftimebint = strftimeb_all[strftimeb]
     return strftimebint
+
+
+def _cumsum_flagged_diff(flagged_data, dim):
+    """ Discrete difference of cumulative flags keeping only the unflagged data
+
+    Each last 0 before a run of 1s returns the count of consecutive 1s that follow
+    it; the first value returns 0 if it is 0, or the length of the initial run of 1s
+    if it is 1; and the last value is dropped.
+    The result is expressed in the units of `dim`, which is assumed evenly spaced.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to search for runs
+
+    Returns
+    -------
+    DataArray
+        Array of lengths of spells of flags along `dim`
+
+    See Also
+    --------
+
+    Notes
+    -----
+    Because diff needs at least 2 points, we keep (with where) the unflagged points
+    as well as the first and last points, with the cumulative value for the last
+    point and 0 for the first. Cumulative flags, where kept, need to be propagated
+    by bfill so that diff returns either 0 or the length of a run.
+
+    I believe this also works for an unevenly spaced `dim`, only we then don't know
+    what the units of the result are.
+
+    This is meant to be further reduced, for instance to find the longest spell, the
+    number of days in spells, etc.
+
+    Examples
+    --------
+    >>> t = pd.date_range(start="2000-05-01", end="2000-05-29", freq="1D")
+    >>> values = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
+    ...     0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0]
+    >>> flags = xr.DataArray(values, dims=["T"], coords={"T": t})
+    >>> _cumsum_flagged_diff(flags, "T")
+
+    array(0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
+    ...
+        0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0)
+    """
+
+    # Special case coord.size = 1
+    lrl = flagged_data
+    if lrl[dim].size != 1:
+        # Points to apply diff to
+        unflagged_and_ends = (flagged_data == 0) * 1
+        unflagged_and_ends[{dim: [0, -1]}] = 1
+
+        lrl = lrl.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
+            # first cumul point must be set to 0
+            lambda x: x[dim] != lrl[dim][0], other=0
+        ).bfill(dim).diff(dim)
+    lrl.attrs = dict(description="Longest Run Length")
+    return lrl
diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py
new file mode 100644
index 00000000..98b52782
--- /dev/null
+++ b/pepsico/test/test_app_calc.py
@@ -0,0 +1,58 @@
+import numpy as np
+import pandas as pd
+import xarray as xr
+import app_calc
+
+
+def test__cumsum_flagged_diff():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [1, 2, 0, 0, 0, 3, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_where_last_is_flagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [1, 2, 0, 0, 5, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_all_flagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [10, 0, 0, 0, 0, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_all_unflagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [0, 0, 0, 0, 0, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_coord_size_1():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    precip_0_1 = precip[0]
+    precip_0_0 = precip_0_1 - 1
+    spells1 = app_calc._cumsum_flagged_diff(precip_0_1, "T")
+    spells0 = app_calc._cumsum_flagged_diff(precip_0_0, "T")
+
+    assert spells1 == 1
+    assert spells0 == 0

From a9cc143e3d395387b65fa9a4a045070f91aa3fae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Cousin?=
Date: Wed, 27 Aug 2025 13:57:38 -0400
Subject: [PATCH 2/4] spell related functions

---
 pepsico/app_calc.py           | 125 ++++++++++++++++++++++++++++++++--
 pepsico/test/test_app_calc.py |  37 ++++++++++
 2 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
index 4cacdf9c..51ad1422 100644
--- a/pepsico/app_calc.py
+++ b/pepsico/app_calc.py
@@ -399,6 +399,8 @@ def _cumsum_flagged_diff(flagged_data, dim):
 
     See Also
     --------
+    count_days_in_spells, length_of_longest_spell, mean_length_of_spells,
+    median_length_of_spells
 
     Notes
     -----
@@ -427,15 +429,126 @@ def _cumsum_flagged_diff(flagged_data, dim):
     """
 
     # Special case coord.size = 1
-    lrl = flagged_data
-    if lrl[dim].size != 1:
+    cfd = flagged_data
+    if cfd[dim].size != 1:
         # Points to apply diff to
         unflagged_and_ends = (flagged_data == 0) * 1
         unflagged_and_ends[{dim: [0, -1]}] = 1
 
-        lrl = lrl.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
+        cfd = cfd.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
             # first cumul point must be set to 0
-            lambda x: x[dim] != lrl[dim][0], other=0
+            lambda x: x[dim] != cfd[dim][0], other=0
         ).bfill(dim).diff(dim)
-    lrl.attrs = dict(description="Longest Run Length")
-    return lrl
+    return cfd
+
+def count_days_in_spells(flagged_data, dim, min_spell_length=1):
+    """ Counts number of days in spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length for a spell to be accounted for
+
+    Returns
+    -------
+    DataArray
+        Array of number of days found in spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).sum(dim=dim)
+
+def length_of_longest_spell(flagged_data, dim):
+    """ Length of longest spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+
+    Returns
+    -------
+    DataArray
+        Array of lengths of longest spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).max(dim=dim)
+
+def mean_length_of_spells(flagged_data, dim, min_spell_length=1):
+    """ Mean length of spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length for a spell to be accounted for
+
+    Returns
+    -------
+    DataArray
+        Array of mean length of spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).mean(dim=dim)
+
+def median_length_of_spells(flagged_data, dim, min_spell_length=1):
+    """ Median length of spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length for a spell to be accounted for
+
+    Returns
+    -------
+    DataArray
+        Array of median length of spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).median(dim=dim)
diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py
index 98b52782..a8b312e5 100644
--- a/pepsico/test/test_app_calc.py
+++ b/pepsico/test/test_app_calc.py
@@ -56,3 +56,40 @@ def test__cumsum_flagged_diff_coord_size_1():
 
     assert spells1 == 1
     assert spells0 == 0
+
+def test_count_days_in_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    days_in_spells = app_calc.count_days_in_spells(precip, "T", min_spell_length=2)
+
+    assert days_in_spells == 5
+
+def test_length_of_longest_spell():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    longest_spell = app_calc.length_of_longest_spell(precip, "T")
+
+    assert longest_spell == 3
+
+def test_mean_length_of_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    mean_spell = app_calc.mean_length_of_spells(precip, "T", min_spell_length=2)
+
+    assert mean_spell == 2.5
+
+def test_median_length_of_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    mendian_spell = app_calc.median_length_of_spells(precip, "T")
+
+    assert mendian_spell == 2
+    
\ No newline at end of file

From 11e8370d33abd5fa180265003549a62c17884a1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Cousin?=
Date: Thu, 13 Nov 2025 11:54:12 -0500
Subject: [PATCH 3/4] alternative to _cumsum_flagged_diff

---
 pepsico/app_calc.py           | 24 ++++++++++++++++++++++++
 pepsico/test/test_app_calc.py | 17 ++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
index 51ad1422..8caf7f13 100644
--- a/pepsico/app_calc.py
+++ b/pepsico/app_calc.py
@@ -552,3 +552,27 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1):
     return _cumsum_flagged_diff(flagged_data, dim).where(
         lambda x : x >= min_spell_length
     ).median(dim=dim)
+
+
+def _accumulate_spells(flagged_data, axis=0, dtype=None, out=None):
+    return xr.apply_ufunc(
+        np.frompyfunc(lambda x, y: 0 if y == 0 else x + y, 2, 1).accumulate,
+        flagged_data, axis, dtype, out
+    )
+
+
+def spells_length(flagged_data, dim):
+    # cumuls 1s and resets counts when 0
+    # then rolls data by 1 to position total cumuls on 0s of the flagged data
+    # except for the last point that becomes first
+    spells = _accumulate_spells(
+        flagged_data, axis=flagged_data.get_axis_num(dim)
+    ).roll(**{dim: 1})
+    # masks out where data falg to rid of accumulating values but last of each spell
+    spells_only = spells.where(flagged_data == 0)
+    # resets first value to what it was as it could have been erased in previous step
+    spells_only[dict(**{dim : 0})] = spells.isel({dim : 0})
+ # rolls back to get spells length values on last day of spell + spells = spells_only.roll(**{dim: -1}) + # Turns remaining 0s to NaN + return spells.where(spells > 0) diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py index a8b312e5..62b22503 100644 --- a/pepsico/test/test_app_calc.py +++ b/pepsico/test/test_app_calc.py @@ -57,6 +57,22 @@ def test__cumsum_flagged_diff_coord_size_1(): assert spells1 == 1 assert spells0 == 0 + +def test_spells_length(): + + t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D") + values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0]) + precip = xr.DataArray(values, coords={"T": t}) + spells = app_calc.spells_length(precip, "T") + expected = [1, np.nan, np.nan, 2, np.nan, np.nan, np.nan, np.nan, 3, np.nan] + + np.testing.assert_array_equal(spells.isnull(), np.isnan(expected)) + np.testing.assert_array_equal( + spells.where(~spells.isnull(), other=0), + np.where(np.isnan(expected), 0, expected) + ) + + def test_count_days_in_spells(): t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D") @@ -92,4 +108,3 @@ def test_median_length_of_spells(): mendian_spell = app_calc.median_length_of_spells(precip, "T") assert mendian_spell == 2 - \ No newline at end of file From d758567cfcdcfb77e58521463272d933e6ff77f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Cousin?= Date: Fri, 30 Jan 2026 13:45:19 -0500 Subject: [PATCH 4/4] documentation --- pepsico/app_calc.py | 75 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py index 8caf7f13..4e670792 100644 --- a/pepsico/app_calc.py +++ b/pepsico/app_calc.py @@ -214,7 +214,7 @@ def daily_tobegroupedby_season( Parameters ----------- - daily_data : DataArray + daily_data : xr.DataArray Daily data to be grouped. start_day : int Day of the start date of the season. @@ -293,7 +293,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0): Parameters ---------- - daily_dim : DataArray[datetime64[ns]] + daily_dim : xr.DataArray[datetime64[ns]] A daily time dimension. day : int day of the `month`. @@ -305,7 +305,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0): Returns ------- - DataArray[datetime64[ns]] + xr.DataArray[datetime64[ns]] a subset of `daily_dim` with all and only `day`-`month` points, offset by `offset` days. 
See Also @@ -387,14 +387,14 @@ def _cumsum_flagged_diff(flagged_data, dim): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to search for runs Returns ------- - DataArray + xr.DataArray Array of lengths of spells of flags along `dim` See Also @@ -446,7 +446,7 @@ def count_days_in_spells(flagged_data, dim, min_spell_length=1): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count @@ -455,7 +455,7 @@ def count_days_in_spells(flagged_data, dim, min_spell_length=1): Returns ------- - DataArray + xr.DataArray Array of number of days found in spells along `dim` See Also @@ -475,14 +475,14 @@ def length_of_longest_spell(flagged_data, dim): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count Returns ------- - DataArray + xr.DataArray Array of lengths of longest spells along `dim` See Also @@ -500,7 +500,7 @@ def mean_length_of_spells(flagged_data, dim, min_spell_length=1): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count @@ -509,7 +509,7 @@ def mean_length_of_spells(flagged_data, dim, min_spell_length=1): Returns ------- - DataArray + xr.DataArray Array of mean length of spells along `dim` See Also @@ -529,7 +529,7 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count @@ -538,7 +538,7 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1): Returns ------- - DataArray + xr.DataArray Array of median length of spells along `dim` See Also @@ -555,6 +555,33 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1): def _accumulate_spells(flagged_data, axis=0, dtype=None, out=None): + """Accumulates continuous flags. Count reset to 0 when new data is unflagged + + Parameters + ---------- + flagged_data : xr.DataArray + Array of flagged data (0s or 1s) + axis : int, optional + The axis along which to apply the accumulation; default is zero. + dtype : data-type code, optional + The data-type used to represent the intermediate results. Defaults to the + data-type of the output array if such is provided, or the data-type of the + input array if no output array is provided. + out : ndarray, None, or tuple of ndarray and None, optional + Location into which the result is stored. If not provided or None, a + freshly-allocated array is returned. For consistency with ufunc.__call__, if + passed as a keyword argument, can be Ellipses (out=..., which has the same + effect as None as an array is always returned), or a 1-element tuple. + + Returns + ------- + xr.DataArray + Accumulated continuous flags. Count reset to 0 when new data is unflagged. 
+
+    See Also
+    --------
+    numpy.ufunc.accumulate, spells_length
+    """
     return xr.apply_ufunc(
         np.frompyfunc(lambda x, y: 0 if y == 0 else x + y, 2, 1).accumulate,
         flagged_data, axis, dtype, out
     )
 
 
 def spells_length(flagged_data, dim):
+    """Length of each spell of flagged data, reported at the last point of the
+    spell, NaN everywhere else
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+
+    Returns
+    -------
+    xr.DataArray
+        Length of each spell of flagged data, reported at the last point of the
+        spell, NaN everywhere else
+
+    See Also
+    --------
+    _accumulate_spells
+    """
     # cumuls 1s and resets counts when 0
     # then rolls data by 1 to position total cumuls on 0s of the flagged data
     # except for the last point that becomes first
     spells = _accumulate_spells(
         flagged_data, axis=flagged_data.get_axis_num(dim)
     ).roll(**{dim: 1})
-    # masks out where data falg to rid of accumulating values but last of each spell
+    # masks out flagged points to drop the in-progress counts and keep each spell's total
     spells_only = spells.where(flagged_data == 0)
     # resets first value to what it was as it could have been erased in previous step
     spells_only[dict(**{dim : 0})] = spells.isel({dim : 0})
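
For reference, a minimal usage sketch of the helpers introduced by these patches, reusing the toy series from test_app_calc.py. The 0/1 flags would normally come from an upstream condition (for instance, precipitation below some threshold), which is assumed rather than computed here; the commented values are the ones asserted in the tests.

    import numpy as np
    import pandas as pd
    import xarray as xr
    import app_calc

    # Ten daily flags (1 = day belongs to a spell), same series as in the tests.
    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
    flags = xr.DataArray(np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0]), coords={"T": t})

    app_calc.length_of_longest_spell(flags, "T")                    # 3
    app_calc.count_days_in_spells(flags, "T", min_spell_length=2)   # 5 (the 2-day and 3-day spells)
    app_calc.mean_length_of_spells(flags, "T", min_spell_length=2)  # 2.5
    app_calc.median_length_of_spells(flags, "T")                    # 2
    app_calc.spells_length(flags, "T")  # [1, nan, nan, 2, nan, nan, nan, nan, 3, nan]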