From 083b6f15bcc9e49ce8848cd713600d1a75e4558c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Cousin?=
Date: Wed, 27 Aug 2025 13:20:05 -0400
Subject: [PATCH 1/4] background function to identify spells, inspired from enacts/calc.pu

---
 pepsico/app_calc.py           | 64 +++++++++++++++++++++++++++++++++++
 pepsico/test/test_app_calc.py | 58 +++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 pepsico/test/test_app_calc.py

diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
index a52b860f..4cacdf9c 100644
--- a/pepsico/app_calc.py
+++ b/pepsico/app_calc.py
@@ -375,3 +375,67 @@ def strftimeb2int(strftimeb):
     }
     strftimebint = strftimeb_all[strftimeb]
     return strftimebint
+
+
+def _cumsum_flagged_diff(flagged_data, dim):
+    """ Discrete difference of cumulative flags keeping only the unflagged data
+
+    Each last 0 before a run of 1s returns the count of consecutive 1s that follow
+    it; the first value returns 0 if it is 0, or the length of the initial run of 1s
+    if it is 1; and the last value is dropped.
+    The result is expressed in the units of `dim`, which is assumed evenly spaced.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to search for runs
+
+    Returns
+    -------
+    DataArray
+        Array of lengths of spells of flags along `dim`
+
+    See Also
+    --------
+
+    Notes
+    -----
+    Because diff needs at least 2 points, we keep (with where) the unflagged points
+    as well as the first and last points, with the cumulative value for the last
+    point and 0 for the first. Cumulative flags, where kept, need to be propagated
+    by bfill so that diff returns either 0 or the length of a run.
+
+    I believe this also works for an unevenly spaced `dim`, only we then don't know
+    what the units of the result are.
+
+    This is meant to be further reduced, for instance to find the longest spell, the
+    number of days in spells, etc.
+
+    Examples
+    --------
+    >>> t = pd.date_range(start="2000-05-01", end="2000-05-29", freq="1D")
+    >>> values = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
+    ...     0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0]
+    >>> flags = xr.DataArray(values, dims=["T"], coords={"T": t})
+    >>> _cumsum_flagged_diff(flags, "T")
+
+    array(0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
+    ...
+        0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0)
+    """
+
+    # Special case coord.size = 1
+    lrl = flagged_data
+    if lrl[dim].size != 1:
+        # Points to apply diff to
+        unflagged_and_ends = (flagged_data == 0) * 1
+        unflagged_and_ends[{dim: [0, -1]}] = 1
+
+        lrl = lrl.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
+            # first cumul point must be set to 0
+            lambda x: x[dim] != lrl[dim][0], other=0
+        ).bfill(dim).diff(dim)
+    lrl.attrs = dict(description="Longest Run Length")
+    return lrl
diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py
new file mode 100644
index 00000000..98b52782
--- /dev/null
+++ b/pepsico/test/test_app_calc.py
@@ -0,0 +1,58 @@
+import numpy as np
+import pandas as pd
+import xarray as xr
+import app_calc
+
+
+def test__cumsum_flagged_diff():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [1, 2, 0, 0, 0, 3, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_where_last_is_flagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [1, 2, 0, 0, 5, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_all_flagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [10, 0, 0, 0, 0, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_all_unflagged():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    spells = app_calc._cumsum_flagged_diff(precip, "T")
+
+    np.testing.assert_array_equal(spells, [0, 0, 0, 0, 0, 0, 0, 0, 0])
+
+
+def test__cumsum_flagged_diff_coord_size_1():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    precip_0_1 = precip[0]
+    precip_0_0 = precip_0_1 - 1
+    spells1 = app_calc._cumsum_flagged_diff(precip_0_1, "T")
+    spells0 = app_calc._cumsum_flagged_diff(precip_0_0, "T")
+
+    assert spells1 == 1
+    assert spells0 == 0

From a9cc143e3d395387b65fa9a4a045070f91aa3fae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Cousin?=
Date: Wed, 27 Aug 2025 13:57:38 -0400
Subject: [PATCH 2/4] spell related functions

---
 pepsico/app_calc.py           | 125 ++++++++++++++++++++++++++++++++--
 pepsico/test/test_app_calc.py |  37 ++++++++++
 2 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
index 4cacdf9c..51ad1422 100644
--- a/pepsico/app_calc.py
+++ b/pepsico/app_calc.py
@@ -399,6 +399,8 @@ def _cumsum_flagged_diff(flagged_data, dim):
 
     See Also
     --------
+    count_days_in_spells, length_of_longest_spell, mean_length_of_spells,
+    median_length_of_spells
 
     Notes
     -----
@@ -427,15 +429,126 @@ def _cumsum_flagged_diff(flagged_data, dim):
     """
 
     # Special case coord.size = 1
-    lrl = flagged_data
-    if lrl[dim].size != 1:
+    cfd = flagged_data
+    if cfd[dim].size != 1:
         # Points to apply diff to
         unflagged_and_ends = (flagged_data == 0) * 1
         unflagged_and_ends[{dim: [0, -1]}] = 1
 
-        lrl = lrl.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
+        cfd = cfd.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
             # first cumul point must be set to 0
-            lambda x: x[dim] != lrl[dim][0], other=0
+            lambda x: x[dim] != cfd[dim][0], other=0
         ).bfill(dim).diff(dim)
-    lrl.attrs = dict(description="Longest Run Length")
-    return lrl
+    return cfd
+
+def count_days_in_spells(flagged_data, dim, min_spell_length=1):
+    """ Counts number of days in spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length for a spell to be accounted for
+
+    Returns
+    -------
+    DataArray
+        Array of number of days found in spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).sum(dim=dim)
+
+def length_of_longest_spell(flagged_data, dim):
+    """ Length of longest spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+
+    Returns
+    -------
+    DataArray
+        Array of lengths of longest spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).max(dim=dim)
+
+def mean_length_of_spells(flagged_data, dim, min_spell_length=1):
+    """ Mean length of spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length for a spell to be accounted for
+
+    Returns
+    -------
+    DataArray
+        Array of mean length of spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).mean(dim=dim)
+
+def median_length_of_spells(flagged_data, dim, min_spell_length=1):
+    """ Median length of spells.
+
+    Parameters
+    ----------
+    flagged_data : DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+    min_spell_length: int, optional
+        minimum length for a spell to be accounted for
+
+    Returns
+    -------
+    DataArray
+        Array of median length of spells along `dim`
+
+    See Also
+    --------
+    _cumsum_flagged_diff
+
+    Examples
+    --------
+    """
+
+    return _cumsum_flagged_diff(flagged_data, dim).where(
+        lambda x : x >= min_spell_length
+    ).median(dim=dim)
diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py
index 98b52782..a8b312e5 100644
--- a/pepsico/test/test_app_calc.py
+++ b/pepsico/test/test_app_calc.py
@@ -56,3 +56,40 @@ def test__cumsum_flagged_diff_coord_size_1():
 
     assert spells1 == 1
     assert spells0 == 0
+
+def test_count_days_in_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    days_in_spells = app_calc.count_days_in_spells(precip, "T", min_spell_length=2)
+
+    assert days_in_spells == 5
+
+def test_length_of_longest_spell():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    longest_spell = app_calc.length_of_longest_spell(precip, "T")
+
+    assert longest_spell == 3
+
+def test_mean_length_of_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    mean_spell = app_calc.mean_length_of_spells(precip, "T", min_spell_length=2)
+
+    assert mean_spell == 2.5
+
+def test_median_length_of_spells():
+
+    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
+    values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
+    precip = xr.DataArray(values, coords={"T": t})
+    mendian_spell = app_calc.median_length_of_spells(precip, "T")
+
+    assert mendian_spell == 2
+    
\ No newline at end of file

From 11e8370d33abd5fa180265003549a62c17884a1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Cousin?=
Date: Thu, 13 Nov 2025 11:54:12 -0500
Subject: [PATCH 3/4] alternative to _cumsum_flagged_diff

---
 pepsico/app_calc.py           | 24 ++++++++++++++++++++++++
 pepsico/test/test_app_calc.py | 17 ++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py
index 51ad1422..8caf7f13 100644
--- a/pepsico/app_calc.py
+++ b/pepsico/app_calc.py
@@ -552,3 +552,27 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1):
     return _cumsum_flagged_diff(flagged_data, dim).where(
         lambda x : x >= min_spell_length
     ).median(dim=dim)
+
+
+def _accumulate_spells(flagged_data, axis=0, dtype=None, out=None):
+    return xr.apply_ufunc(
+        np.frompyfunc(lambda x, y: 0 if y == 0 else x + y, 2, 1).accumulate,
+        flagged_data, axis, dtype, out
+    )
+
+
+def spells_length(flagged_data, dim):
+    # cumuls 1s and resets counts when 0
+    # then rolls data by 1 to position total cumuls on 0s of the flagged data
+    # except for the last point that becomes first
+    spells = _accumulate_spells(
+        flagged_data, axis=flagged_data.get_axis_num(dim)
+    ).roll(**{dim: 1})
+    # masks out where data falg to rid of accumulating values but last of each spell
+    spells_only = spells.where(flagged_data == 0)
+    # resets first value to what it was as it could have been erased in previous step
+    spells_only[dict(**{dim : 0})] = spells.isel({dim : 0})
+ # rolls back to get spells length values on last day of spell + spells = spells_only.roll(**{dim: -1}) + # Turns remaining 0s to NaN + return spells.where(spells > 0) diff --git a/pepsico/test/test_app_calc.py b/pepsico/test/test_app_calc.py index a8b312e5..62b22503 100644 --- a/pepsico/test/test_app_calc.py +++ b/pepsico/test/test_app_calc.py @@ -57,6 +57,22 @@ def test__cumsum_flagged_diff_coord_size_1(): assert spells1 == 1 assert spells0 == 0 + +def test_spells_length(): + + t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D") + values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0]) + precip = xr.DataArray(values, coords={"T": t}) + spells = app_calc.spells_length(precip, "T") + expected = [1, np.nan, np.nan, 2, np.nan, np.nan, np.nan, np.nan, 3, np.nan] + + np.testing.assert_array_equal(spells.isnull(), np.isnan(expected)) + np.testing.assert_array_equal( + spells.where(~spells.isnull(), other=0), + np.where(np.isnan(expected), 0, expected) + ) + + def test_count_days_in_spells(): t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D") @@ -92,4 +108,3 @@ def test_median_length_of_spells(): mendian_spell = app_calc.median_length_of_spells(precip, "T") assert mendian_spell == 2 - \ No newline at end of file From d758567cfcdcfb77e58521463272d933e6ff77f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Cousin?= Date: Fri, 30 Jan 2026 13:45:19 -0500 Subject: [PATCH 4/4] documentation --- pepsico/app_calc.py | 75 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/pepsico/app_calc.py b/pepsico/app_calc.py index 8caf7f13..4e670792 100644 --- a/pepsico/app_calc.py +++ b/pepsico/app_calc.py @@ -214,7 +214,7 @@ def daily_tobegroupedby_season( Parameters ----------- - daily_data : DataArray + daily_data : xr.DataArray Daily data to be grouped. start_day : int Day of the start date of the season. @@ -293,7 +293,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0): Parameters ---------- - daily_dim : DataArray[datetime64[ns]] + daily_dim : xr.DataArray[datetime64[ns]] A daily time dimension. day : int day of the `month`. @@ -305,7 +305,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0): Returns ------- - DataArray[datetime64[ns]] + xr.DataArray[datetime64[ns]] a subset of `daily_dim` with all and only `day`-`month` points, offset by `offset` days. 
See Also @@ -387,14 +387,14 @@ def _cumsum_flagged_diff(flagged_data, dim): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to search for runs Returns ------- - DataArray + xr.DataArray Array of lengths of spells of flags along `dim` See Also @@ -446,7 +446,7 @@ def count_days_in_spells(flagged_data, dim, min_spell_length=1): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count @@ -455,7 +455,7 @@ def count_days_in_spells(flagged_data, dim, min_spell_length=1): Returns ------- - DataArray + xr.DataArray Array of number of days found in spells along `dim` See Also @@ -475,14 +475,14 @@ def length_of_longest_spell(flagged_data, dim): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count Returns ------- - DataArray + xr.DataArray Array of lengths of longest spells along `dim` See Also @@ -500,7 +500,7 @@ def mean_length_of_spells(flagged_data, dim, min_spell_length=1): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count @@ -509,7 +509,7 @@ def mean_length_of_spells(flagged_data, dim, min_spell_length=1): Returns ------- - DataArray + xr.DataArray Array of mean length of spells along `dim` See Also @@ -529,7 +529,7 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1): Parameters ---------- - flagged_data : DataArray + flagged_data : xr.DataArray Array of flagged data (0s or 1s) dim : str dimension of `flagged_data` along which to count @@ -538,7 +538,7 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1): Returns ------- - DataArray + xr.DataArray Array of median length of spells along `dim` See Also @@ -555,6 +555,33 @@ def median_length_of_spells(flagged_data, dim, min_spell_length=1): def _accumulate_spells(flagged_data, axis=0, dtype=None, out=None): + """Accumulates continuous flags. Count reset to 0 when new data is unflagged + + Parameters + ---------- + flagged_data : xr.DataArray + Array of flagged data (0s or 1s) + axis : int, optional + The axis along which to apply the accumulation; default is zero. + dtype : data-type code, optional + The data-type used to represent the intermediate results. Defaults to the + data-type of the output array if such is provided, or the data-type of the + input array if no output array is provided. + out : ndarray, None, or tuple of ndarray and None, optional + Location into which the result is stored. If not provided or None, a + freshly-allocated array is returned. For consistency with ufunc.__call__, if + passed as a keyword argument, can be Ellipses (out=..., which has the same + effect as None as an array is always returned), or a 1-element tuple. + + Returns + ------- + xr.DataArray + Accumulated continuous flags. Count reset to 0 when new data is unflagged. 
+
+    See Also
+    --------
+    numpy.ufunc.accumulate, spells_length
+    """
     return xr.apply_ufunc(
         np.frompyfunc(lambda x, y: 0 if y == 0 else x + y, 2, 1).accumulate,
         flagged_data, axis, dtype, out
     )
 
 
 def spells_length(flagged_data, dim):
+    """Length of each spell of flagged data, reported at the last point of the
+    spell, NaN everywhere else
+
+    Parameters
+    ----------
+    flagged_data : xr.DataArray
+        Array of flagged data (0s or 1s)
+    dim : str
+        dimension of `flagged_data` along which to count
+
+    Returns
+    -------
+    xr.DataArray
+        Length of each spell of flagged data, reported at the last point of the
+        spell, NaN everywhere else
+
+    See Also
+    --------
+    _accumulate_spells
+    """
     # cumuls 1s and resets counts when 0
     # then rolls data by 1 to position total cumuls on 0s of the flagged data
     # except for the last point that becomes first
     spells = _accumulate_spells(
         flagged_data, axis=flagged_data.get_axis_num(dim)
     ).roll(**{dim: 1})
-    # masks out where data falg to rid of accumulating values but last of each spell
+    # masks out flagged points to drop the in-progress counts and keep each spell's total
     spells_only = spells.where(flagged_data == 0)
     # resets first value to what it was as it could have been erased in previous step
     spells_only[dict(**{dim : 0})] = spells.isel({dim : 0})
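
For reference, a minimal usage sketch of the helpers introduced by these patches, reusing the toy series from test_app_calc.py. The 0/1 flags would normally come from an upstream condition (for instance, precipitation below some threshold), which is assumed rather than computed here; the commented values are the ones asserted in the tests.

    import numpy as np
    import pandas as pd
    import xarray as xr
    import app_calc

    # Ten daily flags (1 = day belongs to a spell), same series as in the tests.
    t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
    flags = xr.DataArray(np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0]), coords={"T": t})

    app_calc.length_of_longest_spell(flags, "T")                    # 3
    app_calc.count_days_in_spells(flags, "T", min_spell_length=2)   # 5 (the 2-day and 3-day spells)
    app_calc.mean_length_of_spells(flags, "T", min_spell_length=2)  # 2.5
    app_calc.median_length_of_spells(flags, "T")                    # 2
    app_calc.spells_length(flags, "T")  # [1, nan, nan, 2, nan, nan, nan, nan, 3, nan]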