Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
266 changes: 266 additions & 0 deletions enacts/calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,272 @@ def groupby_dekads(daily_data, time_dim="T"):
return daily_data.groupby_bins(daily_data[time_dim], dekad_edges, right=False)


def resample_interval_to_daily(time_series, is_intensive=None, time_dim="T_bins"):
""" Resample any (interval-based) time series to daily

Parameters
----------
time_series : xr.DataArray or xr.Dataset
data depending on time intervals greater or equal then a day
is_intensive : boolean, optional
indicate the "extensive" or "intensive" property of `time_series` .
Upsampling to daily requires intensive data.
If False, make intensive by dividing by length of intervals in days
Default is None in which case: if units end with "/day", considers intensive,
else, considers extensive
time_dim : str, optional
name of interval time dimenstion, default is "T_bins"

Returns
-------
time_series : xr.DataArray or xr.Dataset
`time_series` resampled to daily

See Also
--------
pandas.Interval, intervals_to_points, replace_intervals_with_points,
xr.DataArray.resample, xr.Dataset.resample

Notes
-----
The day is considered the smallest unit or interval of time in the sense that a
time dimensions expressed as time points is considered equivalent to intervals of
length 1 day. There may be generalization to make to adapt to the actual smallest
time unit of this ecosystem which is the ns.
In thermodynamics (at the core of climate science), quantities can be categorized
as being intensive or extensive to identify how they change when a system changes
in size: 2 systems merging add up (extensive) their mass and volume, but they
don't (intensive) their density (more at
https://en.wikipedia.org/wiki/Intensive_and_extensive_properties).
Closer to what we care about here, temperature is intensive so a monthly value
can be upsampled to daily by assigning same value to all day (implicitely
admitting that the monthly value is a daily average); but precipitation is
extensive so that so a monthly value can not be upsampled to daily by simply
reassigning it (if it rains 300mm in a month, it can rain 300m as well in only
one day of the month -- and 0 all the other days). However, monthly precipitation
expressed in mm/day is intensive.
Extent property could be inferred in more cases, e.g. Kelvin is intensive.
Waiting for pint to figure that all out.
This function differ from xr.DataArray/Dataset.resample in that resample expects
`time_dim` to be datetime-like but it doesn't consider that pd.Interval of
datetime-like is (probably because it reasons in terms of frequency, ignoring
width (intervals)).
"""
if isinstance(time_series[time_dim].values[0], pd._libs.interval.Interval):
# else time_dim is not intervals thus points thus considered daily already
# make daily for computations
if is_intensive is None :
# There are a lot more cases to cover
if "units" in time_series.attrs :
is_intensive = "/day" in time_series.attrs["units"]
else :
is_intensive = False
if not is_intensive : # Can only ffill intensive data
time_series = time_series / [
time_series[time_dim].values[t].length.days
for t in range(time_series[time_dim].size)
]
if "units" in time_series.attrs :
time_series.attrs["units"] = f'{time_series.attrs["units"]}/day'
time_dim_left = ( # There might be other automatic cases to cover
# Same logic as in intervals_to_points
time_dim.replace("_bins", "_left") if time_dim.endswith("_bins")
else "_".join(time_dim, "_left")
)
time_dim_right = (
time_dim.replace("_bins", "_right") if time_dim.endswith("_bins")
else "_".join(time_dim, "right")
)
time_series = xr.concat([
replace_intervals_with_points(time_series, time_dim, to_point="left"),
# Need to cover entirely the last interval
replace_intervals_with_points(
time_series.isel({time_dim : [-1]}), time_dim, to_point="right"
).rename({time_dim_right : time_dim_left}),
], dim=time_dim_left)
time_series = (
time_series.resample({time_dim_left: "1D"}).ffill()
# once filled, can drop the open right point of last interval
.isel({time_dim_left : slice(0, -1)})
)
return time_series


def regroup(time_series, group="1D", time_dim="T"):
""" Regroup any type of interval-based time series to another

Parameters
----------
time_series : xr.DataArray or xr.Dataset
data depending on time intervals greater or equal then a day
group: str, int, or array-like[pandas.DatetimeIndex]
indicates the new type of intervals to regroup to.
As string, must be: nD, pentad, 8day, dekad, 16day, nM, d-d Mmm
or d Mmm - d Mmm. See Notes for details.
As integer or array-like[pandas.DatetimeIndex],
see xr.DataArray.groupby_bins' `bins` Parameter
time_dim : str, optional
name of interval time dimenstion, default is "T"

Returns
-------
regrouped : xr.core.groupby.DataArray/DatasetGroupBy
`time_series` grouped to specified time intervals groups

See Also
--------
resample_interval_to_daily, pandas.DatetimeIndex, xarray.DataArray.groupby_bins

Notes
-----
The day is considered the smallest unit or interval of time (see Note of
resample_interval_to_daily). In this implementation, all `time_series` inputs are
resampled to daily no matter the `group` of interest. It may not be necessary
and more efficient if `group` is coarser than a day (so nearly all cases). The
intersection of `time_series` intervals and `group` intervals would form the
coarsest partition of `time_dim` and weights could be applied depending on
`method` .
The seasonal grouping is of different nature than all other groupings. While all
others make a partition of time, the purpose of the seasonal grouping is to make
a yearly time series of a tailored sesason (e.g. 19 Jan - 29 Mar). However, the
case fits. As of now, the selection of the season of interest is to be done after
applyting `regroup` . It could be incorporated into `regroup` and constitute the
specificity of this case. Or the case could be removed altogether and put in
another function.
Outputs of `regroup` that have a yearly periodicity (ie all except nD, some nM,
int and some array-like[pandas.DatetimeIndex]), could be used to split the time
dimension in 2: years and intervals of the year; which in turn could be reduced
to make climatologies or yearly time series of a given interval.
Known groups:
* nD (e.g. 7D): intervals of n (e.g. 7) days from first day of `time_dim`
* pentad: partition of the year in 5-day intervals. In leap years, Feb 29 is
included in the 25 Feb - 1 Mar interval, making it 6-day long. E.g. at
https://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCEP/.CPC/.FEWS/.DAILY/.est_prcp/pentadAverage
* 8day: partition of the year in 8-day intervals. The last interval is used to
adjust the partitioning and is 26/27-31 Dec depending on leap years. E.g. at
https://iridl.ldeo.columbia.edu/SOURCES/.USGS/.LandDAAC/.MODIS/.1km/.8day/.version_006/.Terra/.NY/.Day/.LST
* dekad: partition of the months in 3 10-day intervals, except for the last dekad
of the month that runs until the end of the month (from the 21st -- thus can be
8, 9, 10 or 11 -day long)
* 16day: partition of the year in 16-day intervals. The last interval is used to
adjust the partitioning and is 18/19-31 Dec depending on leap years. E.g. at
https://iridl.ldeo.columbia.edu/SOURCES/.USGS/.LandDAAC/.MODIS/.version_006/.EAF/.NDVI
* nM (e.g. 5D): intervals of n (e.g. 5) months from first full month of `time_dim`
* d-d Mmm or d Mmm - d Mmm (e.g. 21-29 Mar or 19 Jan - 29 Mar): 2 seasons to
partition time against. The 2nd season is the complentary to the one given to
`group` (e.g. 30 Mar - 18 Jan).
* int (e.g. 7): number of equally sized intervals in `time_dim` (see
xr.DataArray.groupby_bins' `bins` for details).
* array-like[pandas.DatetimeIndex]: edges of time intervals (see
xr.DataArray.groupby_bins' `bins` for details).
"""
time_series = resample_interval_to_daily(time_series, time_dim=time_dim)
edges_base = pd.date_range(
# Flooring needed only if time_series already daily
start=time_series[time_dim][0].dt.floor("D").values,
# need one more day since right is excluded
end=(time_series[time_dim][-1] + np.timedelta64(1, "D")).dt.floor("D").values,
freq="1D",
)
if isinstance(group, str) :
# Form bins according to group
if group.endswith("D") :
bins = np.array(
[edges_base[t] for t in range(0, edges_base.size, int(group[:-1]))]
)
elif group == "pentad" :
# 29 Feb always in last pentad of Feb
bins = edges_base.where(
(edges_base.dayofyear % 5) == (
1 - np.array([
pd.Timedelta(
((eb.dayofyear > 59) * (not eb.is_leap_year)), "D"
).days for eb in edges_base
])
)
).dropna()
elif group == "8day" :
# last period of year used to adjusting
bins = edges_base.where((edges_base.dayofyear % 8) == 1).dropna()
elif group == "dekad" :
bins = edges_base.where(
(edges_base.day == 1)
| (edges_base.day == 11)
| (edges_base.day == 21)
).dropna()
elif group == "16day" :
# last period of year used to adjusting
bins = edges_base.where((edges_base.dayofyear % 16) == 1).dropna()
elif group.endswith("M") :
bins = edges_base.where(edges_base.day == 1).dropna()
bins = np.array([bins[t] for t in range(0, bins.size, int(group[:-1]))])
elif "-" in group :
# e.g. "29 Feb - 30 Mar" or "2-29 Mar"
# This case usage is to keep only the season of interest given as input.
# Thus not to form a partition of time as in other cases and could be
# moved to another function. Or if kept could include the selection of
# said season of interest. (Or just leave as is).
if " - " in group :
start_day = group.split()[0]
start_month = group.split()[1]
end_day = group.split()[3]
end_month = group.split()[4]
else:
start_day = group.split()[0].split("-")[0]
end_day = group.split()[0].split("-")[1]
start_month = group.split()[1]
end_month = start_month
start_day = int(start_day)
end_day = int(end_day)
if start_day == 29 and start_month == "Feb" :
# don't allow start on 29 Feb: this is pushy
start_day = 1
start_month = "Mar"
offset = 1
if end_day == 29 and end_month == "Feb" :
end_day = 1
end_month = "Mar"
offset = 0
bins = edges_base.where(
(
(edges_base.day == start_day)
& [
(edges_base.month_name()[t][:3] == start_month)
for t in range(edges_base.size)
]
)
| ( # group end is inclusive
((edges_base - pd.Timedelta(offset, "D")).day == end_day)
& [
((
edges_base - pd.Timedelta(offset, "D")
).month_name()[t][:3] == end_month)
for t in range(edges_base.size)
]
)
).dropna()
else:
raise Exception(
f"group as str must be nD, pentad, 8day, dekad, 16day, nM, d-d Mmm"
f" or d Mmm - d Mmm"
)
elif isinstance(group, int) :
bins = group
elif insintance(group, pandas.core.indexes.datetimes.DatetimeIndex):
# custom bins edges from input
bins = group
else :
raise Exception(
f"group must be int, array, or str of form nD, pentad, 8day, dekad,"
f" 16day, nM,d-d Mmm or d Mmm - d Mmm"
)
if (not isinstance(group, int)):
assert (bins.size > 1), (
"data must span at least one full group (need 2 edges to form 1 bin)"
)
return time_series.groupby_bins(time_series[time_dim], bins, right=False)


def strftimeb2int(strftimeb):
"""Convert month values to integers (1-12) from strings.

Expand Down
Loading
Loading