From 7dac882d9b75fd4f6973faec98fea588325cbedd Mon Sep 17 00:00:00 2001
From: Konstantin Ramthun
Date: Sun, 7 Sep 2025 17:00:27 +0200
Subject: [PATCH 1/4] feat!: Added frequency aware one-hot and relative cyclic
 encoding.

---
 .../tests/utils/test_timeseries_generation.py |  84 ++++
 darts/utils/timeseries_generation.py          | 435 +++++++++++++-----
 2 files changed, 416 insertions(+), 103 deletions(-)

diff --git a/darts/tests/utils/test_timeseries_generation.py b/darts/tests/utils/test_timeseries_generation.py
index 7099a47e20..56e3ba35ad 100644
--- a/darts/tests/utils/test_timeseries_generation.py
+++ b/darts/tests/utils/test_timeseries_generation.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from pandas.tseries.frequencies import to_offset
 
 from darts import TimeSeries
 from darts.utils.timeseries_generation import (
@@ -18,6 +19,7 @@
     linear_timeseries,
     random_walk_timeseries,
     sine_timeseries,
+    unique_datetime_value_freq_aware,
 )
 from darts.utils.utils import freqs
 
@@ -385,6 +387,88 @@ def test_datetime_attribute_timeseries_wrong_args(self):
         )
         assert "`time_index` must be time zone naive." == str(err.value)
 
+    @pytest.mark.parametrize(
+        "attribute,freq,start,expected",
+        [
+            pytest.param(
+                "minute",
+                to_offset("1min"),
+                pd.Timestamp(year=2000, month=1, day=1),
+                np.arange(60),
+                id="minute_minutely",
+            ),
+            pytest.param(
+                "minute",
+                to_offset("1min"),
+                pd.Timestamp(year=2000, month=1, day=1, minute=1),
+                np.arange(60),
+                id="minute_minutely_one_minute_shifted",
+            ),
+            pytest.param(
+                "minute",
+                to_offset("1h"),
+                pd.Timestamp(year=2000, month=1, day=1),
+                np.arange(1),
+                id="minute_hourly",
+            ),
+            pytest.param(
+                "minute",
+                to_offset("15min"),
+                pd.Timestamp(year=2000, month=1, day=1),
+                np.array([0, 15, 30, 45]),
+                id="minute_quarter_hourly",
+            ),
+            pytest.param(
+                "day",
+                to_offset("1D"),
+                pd.Timestamp(year=2025, month=1, day=1),
+                np.arange(31),
+                id="day_daily_january",
+            ),
+            pytest.param(
+                "day",
+                to_offset("1D"),
+                pd.Timestamp(year=2025, month=2, day=1),
+                # variable-max attributes fall back to all possible values
+                np.arange(31),
+                id="day_daily_february",
+            ),
+            pytest.param(
+                "day_of_week",
+                to_offset("YS"),
+                pd.Timestamp(year=2025, month=1, day=1),
+                np.arange(7),
+                id="dayofweek_yearly",
+            ),
+            pytest.param(
+                "day",
+                to_offset("YS"),
+                pd.Timestamp(year=2025, month=1, day=1),
+                np.arange(1),
+                id="day_yearly",
+            ),
+            pytest.param(
+                "day",
+                to_offset("B"),
+                pd.Timestamp(year=2025, month=1, day=1),
+                ValueError,
+                id="business_day_value_error",
+            ),
+        ],
+    )
+    def test_unique_datetime_value_freq_aware(
+        self,
+        attribute: str,
+        freq: pd.DateOffset,
+        start: pd.Timestamp,
+        expected: np.ndarray[int] | type[Exception],
+    ):
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with pytest.raises(expected):
+                unique_datetime_value_freq_aware(attribute, freq, start)
+        else:
+            unique_values = unique_datetime_value_freq_aware(attribute, freq, start)
+            np.testing.assert_array_equal(unique_values, expected)
+
     def test_datetime_attribute_timeseries(self):
         idx = generate_index(
             start=pd.Timestamp("2000-01-01"), length=48, freq=freqs["h"]
diff --git a/darts/utils/timeseries_generation.py b/darts/utils/timeseries_generation.py
index 8ee73a75aa..b92194ae77 100644
--- a/darts/utils/timeseries_generation.py
+++ b/darts/utils/timeseries_generation.py
@@ -3,6 +3,7 @@
 -------------------------------
 """
 
+import math
 from collections.abc import Sequence
 from typing import Any, Callable, Optional, Union
 
@@ -34,6 +35,54 @@
     "week_of_year",
 }
 TIMES_NAME = DIMS[TIME_AX]
+MAX_DATETIME_VALUES = {
+    "month": 12,
+    "day": 31,
+    "weekday": 7,
+    "dayofweek": 7,
+    "day_of_week": 7,
+    "hour": 24,
+    "minute": 60,
+    "second": 60,
+    "microsecond": 1000000,
+    "nanosecond": 1000,
+    "quarter": 4,
+    # leap years insert an additional day on the 29th of February
+    "dayofyear": 365 + 1,
+    "day_of_year": 365 + 1,
+    # years contain an additional week if they are:
+    # - a regular year starting on a Thursday
+    # - a leap year starting on a Wednesday
+    "week": 52 + 1,
+    "weekofyear": 52 + 1,
+    "week_of_year": 52 + 1,
+}
+PERIOD_BY_ATTRIBUTE = {
+    "month": pd.Timedelta(days=366),
+    "day": pd.Timedelta(days=31),
+    "weekday": pd.Timedelta(days=7),
+    "dayofweek": pd.Timedelta(days=7),
+    "day_of_week": pd.Timedelta(days=7),
+    "hour": pd.Timedelta(hours=24),
+    "minute": pd.Timedelta(minutes=60),
+    "second": pd.Timedelta(seconds=60),
+    "microsecond": pd.Timedelta(microseconds=1000000),
+    "nanosecond": pd.Timedelta(nanoseconds=1000),
+    "quarter": pd.Timedelta(days=366),  # approx
+    "dayofyear": pd.Timedelta(days=366),
+    "day_of_year": pd.Timedelta(days=366),
+    "week": pd.Timedelta(weeks=53),
+    "weekofyear": pd.Timedelta(weeks=53),
+    "week_of_year": pd.Timedelta(weeks=53),
+}
+DATETIME_ATT_WITH_VARIABLE_MAX = [
+    "day",
+    "dayofyear",
+    "day_of_year",
+    "week",
+    "weekofyear",
+    "week_of_year",
+]
 
 
 def constant_timeseries(
@@ -571,11 +620,221 @@ def holidays_timeseries(
     )
 
 
+def _get_datetime_attribute_values(
+    attribute: str, time_index: pd.DatetimeIndex
+) -> pd.Index:
+    if attribute not in ["week", "weekofyear", "week_of_year"]:
+        values = getattr(time_index, attribute)
+    else:
+        values = (
+            time_index.isocalendar()
+            .set_index("week")
+            .index.astype("int64")
+            .rename("time")
+        )
+    # shift 1-indexed datetime attributes
+    if attribute in ONE_INDEXED_FREQS:
+        values -= 1
+    return values
+
+
+def _timedelta_lcm(td1: pd.Timedelta, td2: pd.Timedelta) -> pd.Timedelta:
+    """Returns the least common multiple (LCM) of two pandas Timedelta objects.
+
+    Parameters
+    ----------
+    td1
+        The first Timedelta.
+    td2
+        The second Timedelta.
+
+    Returns
+    -------
+    pd.Timedelta
+        The LCM of the two Timedelta objects.
+
+    Raises
+    ------
+    ValueError
+        If no meaningful LCM exists (e.g., for zero or non-integer nanosecond values).
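+
+    Examples
+    --------
+    A minimal illustration (the values follow from pandas' integer-nanosecond
+    representation of ``Timedelta``):
+
+    >>> _timedelta_lcm(pd.Timedelta("15min"), pd.Timedelta("1h"))
+    Timedelta('0 days 01:00:00')
+    >>> _timedelta_lcm(pd.Timedelta("7min"), pd.Timedelta("1h"))
+    Timedelta('0 days 07:00:00')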
+    """
+    ns1 = td1.value
+    ns2 = td2.value
+
+    # Check for zero timedelta
+    if ns1 == 0 or ns2 == 0:
+        raise ValueError("Timedelta values must be non-zero.")
+
+    # Check for integer nanosecond representation
+    if not isinstance(ns1, int) or not isinstance(ns2, int):
+        raise ValueError("Timedelta values must be integer nanoseconds.")
+
+    gcd = math.gcd(ns1, ns2)
+    if gcd == 0:
+        raise ValueError("No meaningful LCM possible (GCD is zero).")
+
+    lcm_ns = abs(ns1 * ns2) // gcd
+
+    # Check if LCM is a multiple of both inputs
+    if lcm_ns % ns1 != 0 or lcm_ns % ns2 != 0:
+        raise ValueError("No integer LCM exists for these Timedelta values.")
+
+    return pd.Timedelta(lcm_ns, unit="ns")
+
+
+def unique_datetime_value_freq_aware(
+    attribute: str, freq: pd.tseries.offsets.BaseOffset, start: pd.Timestamp
+) -> np.ndarray[int]:
+    """Returns a sorted array of unique values that the given datetime attribute can take, based on `freq` and `start`.
+
+    Parameters
+    ----------
+    attribute
+        An attribute of `pd.DatetimeIndex`, or `week` / `weekofyear` / `week_of_year` - e.g. "month", "weekday",
+        "day", "hour", "minute", "second". See all available attributes in
+        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html#pandas.DatetimeIndex.
+    freq
+        The frequency of the time index.
+    start
+        The start of the time index.
+
+    Returns
+    -------
+    np.ndarray[int]
+        Sorted array of all the unique values that the given datetime attribute can take.
+
+    See Also
+    --------
+    unique_datetime_values: When all possible values for the attribute are to be returned.
+
+    Warnings
+    --------
+    For attributes with a variable number of maximum values (day, dayofyear, day_of_year, week, weekofyear,
+    week_of_year), this function will return all possible values as fallback, since actually computing the values
+    would be inefficient.
+
+    Examples
+    --------
+    >>> from darts.utils.timeseries_generation import unique_datetime_value_freq_aware
+    >>> from pandas.tseries.frequencies import to_offset
+    >>> unique_datetime_value_freq_aware("hour", to_offset("15min"), pd.Timestamp("2020-01-01"))
+    array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+           17, 18, 19, 20, 21, 22, 23])
+    >>> unique_datetime_value_freq_aware("minute", to_offset("15min"), pd.Timestamp("2020-01-01"))
+    array([ 0, 15, 30, 45])
+    """
+    raise_if_not(
+        attribute in MAX_DATETIME_VALUES,
+        f"Can't determine unique values for attribute `{attribute}`, required for cyclic and one-hot encodings. "
+        f"Supported datetime attributes: {list(MAX_DATETIME_VALUES.keys())}",
+        logger,
+    )
+    # Common frequencies, which are not convertible to pd.Timedelta
+    fixed_yearly = {
+        "month",
+        "day",
+        "hour",
+        "minute",
+        "second",
+        "microsecond",
+        "nanosecond",
+        "quarter",
+    }
+    fixed_monthly = {"day", "hour", "minute", "second", "microsecond", "nanosecond"}
+    fixed_attributes = {
+        pd.tseries.offsets.YearBegin: fixed_yearly,
+        pd.tseries.offsets.YearEnd: fixed_yearly,
+        pd.tseries.offsets.MonthBegin: fixed_monthly,
+        pd.tseries.offsets.MonthEnd: fixed_monthly,
+    }
+    if type(freq) in fixed_attributes:
+        if attribute in fixed_attributes[type(freq)]:
+            val = np.array([getattr(start, attribute)])
+            if attribute in ONE_INDEXED_FREQS:
+                val -= 1
+            return val
+        else:
+            return unique_datetime_values(attribute)
+    # Handle other frequencies
+    freq_delta = None
+    try:
+        freq_delta = pd.Timedelta(freq.freqstr)
+    except ValueError as e:
+        if e.args and "unit abbreviation w/o a number" in e.args[0]:
+            try:
+                freq_delta = pd.Timedelta(1, unit=freq.freqstr)
+            except ValueError:
+                pass
+    finally:
+        if freq_delta is None:
+            raise_log(
+                ValueError(
+                    f"Can't convert freq `{freq.freqstr}` to pd.Timedelta, required for computing unique values for "
+                    f"attribute `{attribute}`. Please provide a frequency that can be converted to pd.Timedelta, "
+                    f"e.g. '15min', '1H', '3D', '1W'. Alternatively, use a frequency unaware encoding or omit the "
+                    "attribute."
+                ),
+                logger,
+            )
+    if attribute in DATETIME_ATT_WITH_VARIABLE_MAX:
+        # For these attributes, periods must be really long to capture all possible values
+        logger.warning(
+            "Finding unique values for attribute `%s` based on frequency uses all possible values as fallback.",
+            attribute,
+        )
+        return unique_datetime_values(attribute)
+    lcm = _timedelta_lcm(freq_delta, PERIOD_BY_ATTRIBUTE[attribute])
+    num_unique = lcm // freq_delta
+    idx = pd.date_range(start=start, freq=freq_delta, periods=num_unique)
+    values: pd.Index = _get_datetime_attribute_values(attribute, idx).sort_values()
+    return values.unique().to_numpy()
+
+
+def unique_datetime_values(attribute: str) -> np.ndarray[int]:
+    """Returns a sorted array of all the unique values that the given datetime attribute can take.
+
+    Parameters
+    ----------
+    attribute
+        An attribute of `pd.DatetimeIndex`, or `week` / `weekofyear` / `week_of_year` - e.g. "month", "weekday",
+        "day", "hour", "minute", "second". See all available attributes in
+        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html#pandas.DatetimeIndex.
+
+    Returns
+    -------
+    np.ndarray[int]
+        Sorted array of all the unique values that the given datetime attribute can take.
+
+    See Also
+    --------
+    unique_datetime_value_freq_aware: When the unique values are to be determined based on `freq` and `start`.
+
+    Examples
+    --------
+    >>> from darts.utils.timeseries_generation import unique_datetime_values
+    >>> unique_datetime_values("month")
+    array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
+    """
+    raise_if_not(
+        attribute in MAX_DATETIME_VALUES,
+        f"Can't determine unique values for attribute `{attribute}`, required for cyclic and one-hot encodings. "
+        f"Supported datetime attributes: {list(MAX_DATETIME_VALUES.keys())}",
+        logger,
+    )
+    return np.arange(MAX_DATETIME_VALUES[attribute])
+
+
 def datetime_attribute_timeseries(
     time_index: Union[pd.DatetimeIndex, TimeSeries],
     attribute: str,
     one_hot: bool = False,
+    one_hot_freq_aware: bool = False,
     cyclic: bool = False,
+    cyclic_relative: bool = False,
     until: Optional[Union[int, str, pd.Timestamp]] = None,
     add_length: int = 0,
     dtype=np.float64,
@@ -600,10 +859,19 @@
     one_hot
         Boolean value indicating whether to add the specified attribute as a one hot encoding (results in more
         columns).
+    one_hot_freq_aware
+        Boolean value that controls the behavior of one-hot encoding. If `True`, the one-hot encoding includes
+        only the actually possible values based on the frequency and start of the time index. If `False`, the
+        encoding includes all possible values for the attribute.
     cyclic
         Boolean value indicating whether to add the specified attribute as a cyclic encoding.
         Alternative to one_hot encoding, enable only one of the two.
         (adds 2 columns, corresponding to sin and cos transformation)
+    cyclic_relative
+        Boolean value controlling the behavior of cyclic encoding for attributes with a variable maximum value
+        (e.g., `day`, `dayofyear`, `week`). If `True`, the cyclic encoding uses the relative period based on the
+        actual number of days or weeks in the current month or year. If `False`, the encoding uses the absolute
+        maximum possible value for the attribute (e.g., 31 for days, 366 for days in a year, 53 for weeks).
     until
         Extend the time_index up until timestamp for datetime indexed series
         and int for range indexed series, should match or exceed forecasting window.
@@ -647,107 +915,76 @@
 
     raise_if(one_hot and cyclic, "set only one of one_hot or cyclic to true", logger)
 
-    num_values_dict = {
-        "month": 12,
-        "day": 31,
-        "weekday": 7,
-        "dayofweek": 7,
-        "day_of_week": 7,
-        "hour": 24,
-        "minute": 60,
-        "second": 60,
-        "microsecond": 1000000,
-        "nanosecond": 1000,
-        "quarter": 4,
-        "dayofyear": 365,
-        "day_of_year": 365,
-        "week": 52,
-        "weekofyear": 52,
-        "week_of_year": 52,
-    }
-
-    if attribute not in ["week", "weekofyear", "week_of_year"]:
-        values = getattr(time_index, attribute)
-    else:
-        values = (
-            time_index.isocalendar()
-            .set_index("week")
-            .index.astype("int64")
-            .rename("time")
-        )
-
-    # shift 1-indexed datetime attributes
-    if attribute in ONE_INDEXED_FREQS:
-        values -= 1
-
-    # leap years insert an additional day on the 29th of February
-    if attribute in {"dayofyear", "day_of_year"} and any(time_index.is_leap_year):
-        num_values_dict[attribute] += 1
-
-    # years contain an additional week if they are :
-    # - a regular year starting on a thursday
-    # - a leap year starting on a wednesday
-    if attribute in {"week", "weekofyear", "week_of_year"}:
-        years = time_index.year.unique()
-        # check if year respect properties
-        additional_week_year = any(
-            ((not first_day.is_leap_year) and first_day.day_name() == "Thursday")
-            or (first_day.is_leap_year and first_day.day_name() == "Wednesday")
-            for first_day in [pd.Timestamp(f"{year}-01-01") for year in years]
-        )
-        # check if time index actually include the additional week
-        additional_week_in_index = time_index[-1] - time_index[0] + pd.Timedelta(
-            days=1
-        ) >= pd.Timedelta(days=365)
-
-        if additional_week_year and additional_week_in_index:
-            num_values_dict[attribute] += 1
-
-    if one_hot or cyclic:
-        raise_if_not(
-            attribute in num_values_dict,
-            f"Given datetime attribute `{attribute}` not supported with one-hot or cyclical encoding. "
" - f"Supported datetime attribute: {list(num_values_dict.keys())}", - logger, - ) - - if one_hot: - values_df = pd.get_dummies(values) - # fill missing columns (in case not all values appear in time_index) - attribute_range = np.arange(num_values_dict[attribute]) - is_missing = np.isin(attribute_range, values_df.columns.values, invert=True) - # if there are attribute_range columns that are - # not in values_df.columns.values - if is_missing.any(): - dict_0 = {i: False for i in attribute_range[is_missing]} - # Make a dataframe from the dictionary and concatenate it - # to the values values_df in which the existing columns - values_df = pd.concat( - [values_df, pd.DataFrame(dict_0, index=values_df.index)], axis=1 - ).sort_index(axis=1) - else: - values_df = values_df[attribute_range] - + values = _get_datetime_attribute_values(attribute, time_index) + if not one_hot and not cyclic: if with_columns is None: - with_columns = [ - attribute + "_" + str(column_name) for column_name in values_df.columns - ] - + with_columns = attribute raise_if_not( - len(with_columns) == len(values_df.columns), - "For the given case with `one_hot=True`,`with_columns` must be a list of strings of length " - f"{values_df.columns}.", + isinstance(with_columns, str), + "`with_columns` must be a string specifying the output component name.", logger=logger, ) - - values_df.columns = with_columns + values_df = pd.DataFrame({with_columns: values}) else: - if cyclic: - if attribute == "day": - periods = time_index.days_in_month.values + if one_hot: + if one_hot_freq_aware: + unique_values = unique_datetime_value_freq_aware( + attribute, time_index.freqstr, time_index[0] + ) + else: + unique_values = unique_datetime_values(attribute) + values_df = pd.get_dummies(values) + # fill missing columns (in case not all values appear in time_index) + is_missing = np.isin(unique_values, values_df.columns.values, invert=True) + # if there are attribute_range columns that are + # not in values_df.columns.values + if is_missing.any(): + dict_0 = {i: False for i in unique_values[is_missing]} + # Make a dataframe from the dictionary and concatenate it + # to the values values_df in which the existing columns + values_df = pd.concat( + [values_df, pd.DataFrame(dict_0, index=values_df.index)], axis=1 + ).sort_index(axis=1) + else: + values_df = values_df[unique_values] + + if with_columns is None: + with_columns = [ + attribute + "_" + str(column_name) + for column_name in values_df.columns + ] + else: + raise_if_not( + len(with_columns) == len(values_df.columns), + ( + f"For the given case with `one_hot=True` and `one_hot_freq_aware={one_hot_freq_aware}`, " + f"`with_columns` must be a list of strings of length {values_df.columns}." 
+                    ),
+                    logger=logger,
+                )
+
+            values_df.columns = with_columns
+        else:
+            unique_values = unique_datetime_values(attribute)
+            if attribute in DATETIME_ATT_WITH_VARIABLE_MAX and cyclic_relative:
+                if attribute == "day":
+                    periods = time_index.days_in_month.values
+                elif attribute in ("dayofyear", "day_of_year"):
+                    periods = np.where(time_index.is_leap_year, 366, 365)
+                elif attribute in ("week", "weekofyear", "week_of_year"):
+                    # a year has 53 ISO weeks if it is a regular year starting on a
+                    # Thursday, or a leap year starting on a Wednesday
+                    periods = np.where(
+                        (time_index.is_year_start & (time_index.weekday == 3))
+                        | (
+                            time_index.is_leap_year
+                            & time_index.is_year_start
+                            & (time_index.weekday == 2)
+                        ),
+                        53,
+                        52,
+                    )
                 freq = 2 * np.pi * np.reciprocal(periods.astype(dtype))
             else:
-                period = num_values_dict[attribute]
+                period = unique_values.max() + 1
                 freq = 2 * np.pi / period
 
             if with_columns is None:
@@ -763,15 +1000,7 @@
                 with_columns[0]: np.sin(freq * values),
                 with_columns[1]: np.cos(freq * values),
             })
-    else:
-        if with_columns is None:
-            with_columns = attribute
-        raise_if_not(
-            isinstance(with_columns, str),
-            "`with_columns` must be a string specifying the output component name.",
-            logger=logger,
-        )
-        values_df = pd.DataFrame({with_columns: values})
+
     return TimeSeries(
         times=time_index_ts,
         values=values_df.values.astype(dtype),
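
A note on the relative cyclic encoding added above: with ``cyclic_relative=True``,
each timestamp is mapped onto the unit circle using the length of its own month or
year instead of the global maximum. A minimal sketch of the difference, assuming
the API exactly as introduced in this patch:

    >>> import pandas as pd
    >>> from darts.utils.timeseries_generation import datetime_attribute_timeseries
    >>> idx = pd.date_range("2021-02-01", periods=28, freq="D")  # February 2021 has 28 days
    >>> # absolute: day 28 is scaled by the fixed maximum of 31 days, so the end
    >>> # of February stops well short of a full cycle
    >>> absolute = datetime_attribute_timeseries(idx, "day", cyclic=True)
    >>> # relative: the same day is scaled by days_in_month (28), so Feb 28 lands
    >>> # just short of 2*pi and March 1 wraps back to 0
    >>> relative = datetime_attribute_timeseries(idx, "day", cyclic=True, cyclic_relative=True)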
From 0d28e3d90638e858b146f8e460f7e96727f35c19 Mon Sep 17 00:00:00 2001
From: Konstantin Ramthun
Date: Sun, 7 Sep 2025 18:22:37 +0200
Subject: [PATCH 2/4] test: Changed tests for datetime_attribute_timeseries to
 reflect the new default behavior.

---
 .../tests/utils/test_timeseries_generation.py | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/darts/tests/utils/test_timeseries_generation.py b/darts/tests/utils/test_timeseries_generation.py
index 56e3ba35ad..1f627dd7b1 100644
--- a/darts/tests/utils/test_timeseries_generation.py
+++ b/darts/tests/utils/test_timeseries_generation.py
@@ -8,6 +8,7 @@
 
 from darts import TimeSeries
 from darts.utils.timeseries_generation import (
+    DATETIME_ATT_WITH_VARIABLE_MAX,
     ONE_INDEXED_FREQS,
     _build_forecast_series_from_schema,
     autoregressive_timeseries,
@@ -508,8 +509,8 @@ def test_datetime_attribute_timeseries(self):
             (freqs["h"], "hour", 24),
             ("D", "weekday", 7),
             (freqs["s"], "second", 60),
-            ("W", "weekofyear", 52),
-            ("D", "dayofyear", 365),
+            ("W", "weekofyear", 53),
+            ("D", "dayofyear", 366),
             (freqs["QE"], "quarter", 4),
         ],
     )
@@ -563,8 +564,10 @@ def test_datetime_attribute_timeseries_one_hot(self, config):
         # first quarter/year, month/year, week/year, day/year, day/week, hour/day, second/hour
         simple_start = pd.Timestamp("2001-01-01 00:00:00")
         idx = generate_index(start=simple_start, length=period, freq=base_freq)
-        vals = np.eye(period)
-
+        expected_dim = period
+        if attribute_freq in DATETIME_ATT_WITH_VARIABLE_MAX:
+            expected_dim += 1
+        vals = np.eye(period, expected_dim)
         # simple start
         self.helper_routine(idx, attribute_freq, vals_exp=vals, one_hot=True)
         # with time-zone
@@ -576,7 +579,7 @@
         # missing values
         cut_period = period // 3
         idx = generate_index(start=simple_start, length=cut_period, freq=base_freq)
-        vals = np.eye(period)
+        vals = np.eye(period, expected_dim)
         # removing missing rows
         vals = vals[:cut_period]
         # mask missing attribute values
@@ -603,7 +606,7 @@
             shift -= 1
 
         idx = generate_index(start=shifted_start, length=period, freq=base_freq)
-        vals = np.eye(period)
+        vals = np.eye(period, expected_dim)
         # shift values
         vals = np.roll(vals, shift=-shift, axis=0)
@@ -701,9 +704,11 @@ def test_datetime_attribute_timeseries_special_years(self, year):
         # the 53th week is omitted from index when created with freq="W"
         index_weeks = pd.date_range(start=start_date, end=end_date, freq="W")
         assert len(index_weeks) == weeks_special_year - 1
-        # and 53th week properly excluded from the encoding
-        vals_exp = np.eye(weeks_special_year - 1)[: len(index_weeks)]
-        assert vals_exp.shape[1] == weeks_special_year - 1
+        # and the 53rd week should still be part of the encoding
+        vals_exp = np.eye(weeks_special_year - 1, weeks_special_year)[
+            : len(index_weeks)
+        ]
+        assert vals_exp.shape[1] == weeks_special_year
         self.helper_routine(
             index_weeks, "week_of_year", vals_exp=vals_exp, one_hot=True
         )
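
The reworked expectations build on rectangular identity matrices: under the new
default, the one-hot encoding always reserves a column for the 53rd week or the
366th day, even when the index never reaches it. A quick illustration of the
``np.eye(period, expected_dim)`` pattern used above:

    >>> import numpy as np
    >>> period, expected_dim = 52, 53
    >>> vals = np.eye(period, expected_dim)  # 52 rows, 53 one-hot columns
    >>> vals.shape
    (52, 53)
    >>> bool(vals[:, -1].any())  # the extra column stays all-zero here
    False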
From 402da1b67caa094d02272b220b896378436f1501 Mon Sep 17 00:00:00 2001
From: Konstantin Ramthun
Date: Sat, 22 Nov 2025 16:40:21 +0100
Subject: [PATCH 3/4] feat: Added frequency aware one-hot encoding for dynamic
 frequencies like business days.

---
 .../tests/utils/test_timeseries_generation.py |  10 +-
 darts/utils/timeseries_generation.py          | 191 +++++++++---------
 2 files changed, 107 insertions(+), 94 deletions(-)

diff --git a/darts/tests/utils/test_timeseries_generation.py b/darts/tests/utils/test_timeseries_generation.py
index 1f627dd7b1..2d3547639d 100644
--- a/darts/tests/utils/test_timeseries_generation.py
+++ b/darts/tests/utils/test_timeseries_generation.py
@@ -451,8 +451,14 @@
                 "day",
                 to_offset("B"),
                 pd.Timestamp(year=2025, month=1, day=1),
-                ValueError,
-                id="business_day_value_error",
+                np.arange(31),
+                id="day_business_daily",
+            ),
+            pytest.param(
+                "nanosecond",
+                to_offset("999999ns"),
+                pd.Timestamp(year=2000, month=1, day=1),
+                np.arange(1000),
             ),
         ],
     )
diff --git a/darts/utils/timeseries_generation.py b/darts/utils/timeseries_generation.py
index b92194ae77..b71cb59e25 100644
--- a/darts/utils/timeseries_generation.py
+++ b/darts/utils/timeseries_generation.py
@@ -10,6 +10,7 @@
 import holidays
 import numpy as np
 import pandas as pd
+from pandas.tseries.offsets import Tick
 
 from darts.logging import get_logger, raise_if, raise_if_not, raise_log
 from darts.timeseries import (
@@ -57,24 +58,30 @@
     "weekofyear": 52 + 1,
     "week_of_year": 52 + 1,
 }
-PERIOD_BY_ATTRIBUTE = {
-    "month": pd.Timedelta(days=366),
-    "day": pd.Timedelta(days=31),
-    "weekday": pd.Timedelta(days=7),
-    "dayofweek": pd.Timedelta(days=7),
-    "day_of_week": pd.Timedelta(days=7),
-    "hour": pd.Timedelta(hours=24),
-    "minute": pd.Timedelta(minutes=60),
-    "second": pd.Timedelta(seconds=60),
-    "microsecond": pd.Timedelta(microseconds=1000000),
-    "nanosecond": pd.Timedelta(nanoseconds=1000),
-    "quarter": pd.Timedelta(days=366),  # approx
-    "dayofyear": pd.Timedelta(days=366),
-    "day_of_year": pd.Timedelta(days=366),
-    "week": pd.Timedelta(weeks=53),
-    "weekofyear": pd.Timedelta(weeks=53),
-    "week_of_year": pd.Timedelta(weeks=53),
-}
+FULL_CALENDAR_CYCLE = pd.Timedelta(days=365 * 28 + 7)  # ~28 years
+"""The solar calendar cycle (https://en.wikipedia.org/wiki/Solar_cycle_(calendar)) of the Julian calendar."""
+
+MAX_GENERATION_STEPS = 100000
+"""Threshold to prevent generating excessively large arrays when calculating unique datetime attribute values."""
+
+ATTRIBUTE_PERIODS = {
+    "microsecond": pd.Timedelta("1s"),
+    "nanosecond": pd.Timedelta("1us"),
+    "second": pd.Timedelta("1min"),
+    "minute": pd.Timedelta("1h"),
+    "hour": pd.Timedelta("1D"),
+    "weekday": pd.Timedelta("1W"),
+    "day_of_week": pd.Timedelta("1W"),
+    "day": FULL_CALENDAR_CYCLE,
+    "month": FULL_CALENDAR_CYCLE,
+    "dayofyear": FULL_CALENDAR_CYCLE,
+    "week": FULL_CALENDAR_CYCLE,
+}
+"""The time it takes for an attribute to naturally reset/wrap around.
+
+For example, minutes wrap around every hour, hours wrap around every day, etc.
+"""
+
 DATETIME_ATT_WITH_VARIABLE_MAX = [
     "day",
     "dayofyear",
@@ -86,6 +93,7 @@
     "weekofyear",
     "week_of_year",
 ]
+"""Time index attributes whose maximum value varies (e.g., day of month (28 to 31), week of year (52 or 53))."""
 
 
 def constant_timeseries(
@@ -693,8 +701,8 @@
 
 
 def unique_datetime_value_freq_aware(
-    attribute: str, freq: pd.tseries.offsets.BaseOffset, start: pd.Timestamp
-) -> np.ndarray[int]:
+    attribute: str, freq: Union[str, pd.tseries.offsets.BaseOffset], start: pd.Timestamp
+) -> np.ndarray[tuple[int], int]:
     """Returns a sorted array of unique values that the given datetime attribute can take, based on `freq` and `start`.
 
     Parameters
@@ -710,18 +718,28 @@
 
     Returns
     -------
-    np.ndarray[int]
+    np.ndarray[tuple[int], int]
         Sorted array of all the unique values that the given datetime attribute can take.
 
    See Also
    --------
    unique_datetime_values: When all possible values for the attribute are to be returned.

-    Warnings
-    --------
-    For attributes with a variable number of maximum values (day, dayofyear, day_of_year, week, weekofyear,
-    week_of_year), this function will return all possible values as fallback, since actually computing the values
-    would be inefficient.
+    Notes
+    -----
+    This function determines unique values using one of three strategies:
+
+    1. **Exact Synchronization:** For fixed frequencies, it simulates the exact period where the frequency and
+       attribute cycle align (LCM).
+       * *Example:* ``attribute="hour", freq="2H"`` -> Returns even hours ``[0, 2, ..., 22]``.
+
+    2. **Calendar Simulation:** For variable frequencies (e.g., business days), it simulates a 28-year cycle to
+       guarantee capturing leap years and weekday shifts.
+       * *Example:* ``attribute="day", freq="B"`` -> Returns ``[0..30]`` (the 29th of February is eventually
+         captured).
+
+    3. **Heuristic Fallback:** If the simulation requires generating an excessive number of points (e.g.,
+       high-frequency data for low-frequency attributes), it assumes all theoretically possible values occur.
+       * *Example:* ``attribute="month", freq="1min"`` -> Returns ``[0..11]`` immediately to save memory.
@@ -733,74 +751,58 @@
     >>> unique_datetime_value_freq_aware("minute", to_offset("15min"), pd.Timestamp("2020-01-01"))
     array([ 0, 15, 30, 45])
     """
-    raise_if_not(
-        attribute in MAX_DATETIME_VALUES,
-        f"Can't determine unique values for attribute `{attribute}`, required for cyclic and one-hot encodings. "
" - f"Supported datetime attribute: {list(MAX_DATETIME_VALUES.keys())}", - logger, - ) - # Common frequencies, which are not convertable to pd.Timedelta - fixed_yearly = { - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - "quarter", - } - fixed_monthly = {"day", "hour", "minute", "second", "microsecond", "nanosecond"} - fixed_attributes = { - pd.tseries.offsets.YearBegin: fixed_yearly, - pd.tseries.offsets.YearEnd: fixed_yearly, - pd.tseries.offsets.MonthBegin: fixed_monthly, - pd.tseries.offsets.MonthEnd: fixed_monthly, - } - if type(freq) in fixed_attributes: - if attribute in fixed_attributes[type(freq)]: - val = np.array([getattr(start, attribute)]) - if attribute in ONE_INDEXED_FREQS: - val -= 1 - return val - else: - return unique_datetime_values(attribute) - # Handle other frequencies - freq_delta = None + # 1. Get the Natural Period of the attribute (~28 years as safe default) + natural_period = ATTRIBUTE_PERIODS.get(attribute, FULL_CALENDAR_CYCLE) + + # 2. Try to convert frequency to Timedelta + freq_td: Optional[pd.Timedelta] = None try: - freq_delta = pd.Timedelta(freq.freqstr) - except ValueError as e: - if e.args and "unit abbreviation w/o a number" in e.args[0]: - try: - freq_delta = pd.Timedelta(1, unit=freq.freqstr) - except ValueError: - pass - finally: - if freq_delta is None: - raise_log( - ValueError( - f"Can't convert freq `{freq.freqstr}` to pd.Timedelta, required for computing unique values for " - f"attribute `{attribute}`. Please provide a frequency that can be converted to pd.Timedelta, " - f"e.g. '15min', '1H', '3D', '1W'. Alternatively, use a frequency unaware encoding or omit the " - "attribute." - ), - logger, - ) - if attribute in DATETIME_ATT_WITH_VARIABLE_MAX: - # For these attributes, periods must be really long to capture all possible values - # - logger.warning( - "Finding unique values for attribute `%s` based on frequency uses all possible values as fallback.", - attribute, - ) - return unique_datetime_values(attribute) - lcm = _timedelta_lcm(freq_delta, PERIOD_BY_ATTRIBTUE[attribute]) - num_unique = lcm // freq_delta - idx = pd.date_range(start=start, freq=freq_delta, periods=num_unique) - values: pd.Index = _get_datetime_attribute_values(attribute, idx).sort_values() - return values.unique().to_numpy() + offset = pd.tseries.frequencies.to_offset(freq) + if isinstance(offset, Tick): + freq_td = pd.Timedelta(offset) + + except (ValueError, TypeError): + # Handle raw strings that to_offset might not like, but to_timedelta might + # e.g., "15min" is fine, but sometimes complex strings fail to_offset + pass + + # Fallback: Try direct string-to-timedelta conversion if the above failed + # This handles strings like "10us" if to_offset failed + if freq_td is None: + try: + freq_td = pd.to_timedelta(freq) + except (ValueError, TypeError): + # If this fails, it is truly a variable frequency (e.g. 'M', 'B') + pass + + # 3. Dynamic Duration Calculation + if freq_td is not None: + # How long until the Freq and the Attribute Period sync up? + total_duration = _timedelta_lcm(freq_td, natural_period) + # Check how many points this requires + num_points = total_duration // freq_td + + # Safety fallback: If the interference pattern requires a large number of points + if num_points > MAX_GENERATION_STEPS: + return unique_datetime_values(attribute) + + # Otherwise, simulate exact LCM duration + idx = pd.date_range(start=start, periods=num_points, freq=freq_td) + + else: + # Variable frequency (e.g. 
From 7e7fa62e588a22183187898d193bbca53a6286a3 Mon Sep 17 00:00:00 2001
From: Konstantin Ramthun
Date: Sat, 29 Nov 2025 15:42:10 +0100
Subject: [PATCH 4/4] docs: Improved docs of datetime_attribute_timeseries

---
 darts/utils/timeseries_generation.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/darts/utils/timeseries_generation.py b/darts/utils/timeseries_generation.py
index b71cb59e25..90fba0def1 100644
--- a/darts/utils/timeseries_generation.py
+++ b/darts/utils/timeseries_generation.py
@@ -867,9 +867,9 @@ def datetime_attribute_timeseries(
         Boolean value indicating whether to add the specified attribute as a one hot encoding (results in more
         columns).
     one_hot_freq_aware
-        Boolean value that controls the behavior of one-hot encoding. If `True`, the one-hot encoding includes
-        only the actually possible values based on the frequency and start of the time index. If `False`, the
-        encoding includes all possible values for the attribute.
+        If `True`, the one-hot encoding infers which values are actually possible based on the frequency and
+        start of the time index. If `False`, the encoding includes all possible values for the attribute. Only
+        has an effect if `one_hot` is `True`.
     cyclic
         Boolean value indicating whether to add the specified attribute as a cyclic encoding.
         Alternative to one_hot encoding, enable only one of the two.
@@ -957,8 +957,7 @@
 
             if with_columns is None:
                 with_columns = [
-                    attribute + "_" + str(column_name)
-                    for column_name in values_df.columns
+                    f"{attribute}_{column_name}" for column_name in values_df.columns
                 ]
             else:
                 raise_if_not(
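
Taken together, the effect of ``one_hot_freq_aware`` shows up directly in the
number of components; a final sketch against the state of the code after this
series:

    >>> import pandas as pd
    >>> from darts.utils.timeseries_generation import datetime_attribute_timeseries
    >>> idx = pd.date_range("2020-01-01", periods=8, freq="15min")
    >>> datetime_attribute_timeseries(idx, "minute", one_hot=True).n_components
    60
    >>> datetime_attribute_timeseries(
    ...     idx, "minute", one_hot=True, one_hot_freq_aware=True
    ... ).n_components
    4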