Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 251 additions & 3 deletions pepsico/app_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def daily_tobegroupedby_season(

Parameters
-----------
daily_data : DataArray
daily_data : xr.DataArray
Daily data to be grouped.
start_day : int
Day of the start date of the season.
Expand Down Expand Up @@ -293,7 +293,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0):

Parameters
----------
daily_dim : DataArray[datetime64[ns]]
daily_dim : xr.DataArray[datetime64[ns]]
A daily time dimension.
day : int
day of the `month`.
Expand All @@ -305,7 +305,7 @@ def sel_day_and_month(daily_dim, day, month, offset=0):

Returns
-------
DataArray[datetime64[ns]]
xr.DataArray[datetime64[ns]]
a subset of `daily_dim` with all and only `day`-`month` points, offset by `offset` days.

See Also
Expand Down Expand Up @@ -375,3 +375,251 @@ def strftimeb2int(strftimeb):
}
strftimebint = strftimeb_all[strftimeb]
return strftimebint


def _cumsum_flagged_diff(flagged_data, dim):
""" Discrete difference of cumulative flags keeping only the unflagged data

Last 0s before a 1 return the count of consecutive 1s after that 0, except for
the first value that will return 0 if 0, and the number of consecutive 1s if 1,
and the last value that is dropped.
The result is expressed in the units of `dim`, that is assumed evenly spaced.

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
dim : str
dimension of `flagged_data` along which to search for runs

Returns
-------
xr.DataArray
Array of lengths of spells of flags along `dim`

See Also
--------
count_days_in_spell, length_of_longest_spell, mean_length_of_spell,
median_length_of_spell

Notes
-----
Because diff needs at least 2 points,
we need to keep (where) the unflagged and first and last
with the cumulative value for last and 0 for first.
Cumulative flags, where kept, need be propagated by bfill
so that diff returns 0 or the length of runs.

I believe that it works for unevenly spaced `dim`,
only we then don't know what the units of the result are.

Is meant to be further reduced to for instance find the longest spell, or the
numbers of days in spells, etc.

Examples
--------
>>> t = pd.date_range(start="2000-05-01", end="2000-05-29", freq="1D")
>>> values = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
... 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0]
>>> flags = xr.DataArray(values, dims=["T"], coords={"T": t})
>>> _cumsum_flagged_diff(flags, "T")
<xarray.DataArray ()>
array(0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
... 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0)
"""

# Special case coord.size = 1
cfd = flagged_data
if cfd[dim].size != 1:
# Points to apply diff to
unflagged_and_ends = (flagged_data == 0) * 1
unflagged_and_ends[{dim: [0, -1]}] = 1

cfd = cfd.cumsum(dim=dim).where(unflagged_and_ends, other = np.nan).where(
# first cumul point must be set to 0
lambda x: x[dim] != cfd[dim][0], other=0
).bfill(dim).diff(dim)
return cfd

def count_days_in_spells(flagged_data, dim, min_spell_length=1):
""" Counts number of days in spells.

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
dim : str
dimension of `flagged_data` along which to count
min_spell_length: int, optional
minimum length of for a spell to be accounted for

Returns
-------
xr.DataArray
Array of number of days found in spells along `dim`

See Also
--------
_cumsum_flagged_diff

Examples
--------
"""

return _cumsum_flagged_diff(flagged_data, dim).where(
lambda x : x >= min_spell_length
).sum(dim=dim)

def length_of_longest_spell(flagged_data, dim):
""" Length of longest spells.

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
dim : str
dimension of `flagged_data` along which to count

Returns
-------
xr.DataArray
Array of lengths of longest spells along `dim`

See Also
--------
_cumsum_flagged_diff

Examples
--------
"""

return _cumsum_flagged_diff(flagged_data, dim).max(dim=dim)

def mean_length_of_spells(flagged_data, dim, min_spell_length=1):
""" Mean length of spells.

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
dim : str
dimension of `flagged_data` along which to count
min_spell_length: int, optional
minimum length of for a spell to be accounted for

Returns
-------
xr.DataArray
Array of mean length of spells along `dim`

See Also
--------
_cumsum_flagged_diff

Examples
--------
"""

return _cumsum_flagged_diff(flagged_data, dim).where(
lambda x : x >= min_spell_length
).mean(dim=dim)

def median_length_of_spells(flagged_data, dim, min_spell_length=1):
""" Median length of spells.

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
dim : str
dimension of `flagged_data` along which to count
min_spell_length: int, optional
minimum length of for a spell to be accounted for

Returns
-------
xr.DataArray
Array of median length of spells along `dim`

See Also
--------
_cumsum_flagged_diff

Examples
--------
"""

return _cumsum_flagged_diff(flagged_data, dim).where(
lambda x : x >= min_spell_length
).median(dim=dim)


def _accumulate_spells(flagged_data, axis=0, dtype=None, out=None):
"""Accumulates continuous flags. Count reset to 0 when new data is unflagged

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
axis : int, optional
The axis along which to apply the accumulation; default is zero.
dtype : data-type code, optional
The data-type used to represent the intermediate results. Defaults to the
data-type of the output array if such is provided, or the data-type of the
input array if no output array is provided.
out : ndarray, None, or tuple of ndarray and None, optional
Location into which the result is stored. If not provided or None, a
freshly-allocated array is returned. For consistency with ufunc.__call__, if
passed as a keyword argument, can be Ellipses (out=..., which has the same
effect as None as an array is always returned), or a 1-element tuple.

Returns
-------
xr.DataArray
Accumulated continuous flags. Count reset to 0 when new data is unflagged.

See Also
--------
numpy.ufunc.accumulate, spells_length
"""
return xr.apply_ufunc(
np.frompyfunc(lambda x, y: 0 if y == 0 else x + y, 2, 1).accumulate,
flagged_data, axis, dtype, out
)


def spells_length(flagged_data, dim):
"""Returns length of spell of flagged data at the last point of the spell, nan
everywhere else

Parameters
----------
flagged_data : xr.DataArray
Array of flagged data (0s or 1s)
dim : str
dimension of `flagged_data` along which to count

Returns
-------
xr.DataArray
Length of spell of flagged data at the last point of the spell, nan
everywhere else

See Also
--------
_accumulate_spells
"""
# cumuls 1s and resets counts when 0
# then rolls data by 1 to position total cumuls on 0s of the flagged data
# except for the last point that becomes first
spells = _accumulate_spells(
flagged_data, axis=flagged_data.get_axis_num(dim)
).roll(**{dim: 1})
# masks out where data flag to rid of accumulating values but last of each spell
spells_only = spells.where(flagged_data == 0)
# resets first value to what it was as it could have been erased in previous step
spells_only[dict(**{dim : 0})] = spells.isel({dim : 0})
# rolls back to get spells length values on last day of spell
spells = spells_only.roll(**{dim: -1})
# Turns remaining 0s to NaN
return spells.where(spells > 0)
110 changes: 110 additions & 0 deletions pepsico/test/test_app_calc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import numpy as np
import pandas as pd
import xarray as xr
import app_calc


def test__cumsum_flagged_diff():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
precip = xr.DataArray(values, coords={"T": t})
spells = app_calc._cumsum_flagged_diff(precip, "T")

np.testing.assert_array_equal(spells, [1, 2, 0, 0, 0, 3, 0, 0, 0])


def test__cumsum_flagged_diff_where_last_is_flagged():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1])
precip = xr.DataArray(values, coords={"T": t})
spells = app_calc._cumsum_flagged_diff(precip, "T")

np.testing.assert_array_equal(spells, [1, 2, 0, 0, 5, 0, 0, 0, 0])


def test__cumsum_flagged_diff_all_flagged():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
precip = xr.DataArray(values, coords={"T": t})
spells = app_calc._cumsum_flagged_diff(precip, "T")

np.testing.assert_array_equal(spells, [10, 0, 0, 0, 0, 0, 0, 0, 0])


def test__cumsum_flagged_diff_all_unflagged():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
precip = xr.DataArray(values, coords={"T": t})
spells = app_calc._cumsum_flagged_diff(precip, "T")

np.testing.assert_array_equal(spells, [0, 0, 0, 0, 0, 0, 0, 0, 0])


def test__cumsum_flagged_diff_coord_size_1():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0])
precip = xr.DataArray(values, coords={"T": t})
precip_0_1 = precip[0]
precip_0_0 = precip_0_1 - 1
spells1 = app_calc._cumsum_flagged_diff(precip_0_1, "T")
spells0 = app_calc._cumsum_flagged_diff(precip_0_0, "T")

assert spells1 == 1
assert spells0 == 0


def test_spells_length():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
precip = xr.DataArray(values, coords={"T": t})
spells = app_calc.spells_length(precip, "T")
expected = [1, np.nan, np.nan, 2, np.nan, np.nan, np.nan, np.nan, 3, np.nan]

np.testing.assert_array_equal(spells.isnull(), np.isnan(expected))
np.testing.assert_array_equal(
spells.where(~spells.isnull(), other=0),
np.where(np.isnan(expected), 0, expected)
)


def test_count_days_in_spells():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
precip = xr.DataArray(values, coords={"T": t})
days_in_spells = app_calc.count_days_in_spells(precip, "T", min_spell_length=2)

assert days_in_spells == 5

def test_length_of_longest_spell():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
precip = xr.DataArray(values, coords={"T": t})
longest_spell = app_calc.length_of_longest_spell(precip, "T")

assert longest_spell == 3

def test_mean_length_of_spells():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
precip = xr.DataArray(values, coords={"T": t})
mean_spell = app_calc.mean_length_of_spells(precip, "T", min_spell_length=2)

assert mean_spell == 2.5

def test_median_length_of_spells():

t = pd.date_range(start="2000-05-01", end="2000-05-10", freq="1D")
values = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
precip = xr.DataArray(values, coords={"T": t})
mendian_spell = app_calc.median_length_of_spells(precip, "T")

assert mendian_spell == 2
Loading