From ffabd2359876a35caf6e1218c97d2f156329b59a Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Thu, 27 Mar 2025 16:52:02 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20SkProphet=20improve=20to?= =?UTF-8?q?=20take=20additional=20regressor=20as=20features.=20SkProphet,?= =?UTF-8?q?=20SkSTLForecast=20fit=20and=20predict=20method=20rewrote=20for?= =?UTF-8?q?=20better=20scikit=20API=20compatibility.=20Regressor=20updated?= =?UTF-8?q?=20to=20work=20with=20these=20new=20versions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_regressors.py | 59 ++++++++++--- tide/processing.py | 7 +- tide/regressors.py | 174 ++++++++++++++++++++++++++++----------- tide/utils.py | 13 ++- 4 files changed, 189 insertions(+), 64 deletions(-) diff --git a/tests/test_regressors.py b/tests/test_regressors.py index 9191619..1748b69 100644 --- a/tests/test_regressors.py +++ b/tests/test_regressors.py @@ -23,8 +23,16 @@ def toy_data(self): ) toy_series = pd.Series(annual + daily + 5, index=index) + + exo = 12 + 3 * np.arange(index.shape[0]) + toy_df = pd.DataFrame( - {"Temp_1__°C": toy_series, "Temp_2__°C": toy_series * 1.25 + 2} + { + "Temp_1__°C": toy_series, + "Temp_2__°C": toy_series * 1.25 + 2, + "Temp_3__°C": toy_series + exo, + "Exo": exo, + } ) return toy_df @@ -36,40 +44,71 @@ def test_stl_forecaster(self, toy_data): backcast=False, ) - forecaster.fit(toy_data["2009-01-24":"2009-07-24"]) + forecaster.fit( + X=toy_data["2009-01-24":"2009-07-24"].index, + y=toy_data.loc["2009-01-24":"2009-07-24", ["Temp_1__°C", "Temp_2__°C"]], + ) reg_score = forecaster.score( - toy_data["2009-07-27":"2009-07-30"], toy_data["2009-07-27":"2009-07-30"] + toy_data["2009-07-27":"2009-07-30"], + toy_data.loc["2009-07-27":"2009-07-30", ["Temp_1__°C", "Temp_2__°C"]], ) assert reg_score > 0.99 backcaster = SkSTLForecast(backcast=True) - backcaster.fit(toy_data["2009-01-24":"2009-07-24"]) + backcaster.fit( + X=toy_data.loc["2009-01-24":"2009-07-24"], + y=toy_data.loc["2009-01-24":"2009-07-24", ["Temp_1__°C", "Temp_2__°C"]], + ) reg_score = backcaster.score( - toy_data["2009-01-20":"2009-01-22"], toy_data["2009-01-20":"2009-01-22"] + toy_data["2009-01-20":"2009-01-22"].index, + toy_data.loc["2009-01-20":"2009-01-22", ["Temp_1__°C", "Temp_2__°C"]], ) assert reg_score > 0.99 def test_prophet_forecaster(self, toy_data): forecaster = SkProphet() - forecaster.fit(toy_data["2009-01-24":"2009-07-24"]) + forecaster.fit( + X=toy_data["2009-01-24":"2009-07-24"].index, + y=toy_data.loc["2009-01-24":"2009-07-24", ["Temp_1__°C", "Temp_2__°C"]], + ) reg_score = forecaster.score( - toy_data["2009-07-27":"2009-07-30"], toy_data["2009-07-27":"2009-07-30"] + toy_data["2009-07-27":"2009-07-30"].index, + toy_data.loc["2009-07-27":"2009-07-30", ["Temp_1__°C", "Temp_2__°C"]], ) assert reg_score > 0.99 reg_score = forecaster.score( - toy_data["2009-01-20":"2009-01-22"], toy_data["2009-01-20":"2009-01-22"] + toy_data["2009-01-20":"2009-01-22"].index, + toy_data.loc["2009-01-20":"2009-01-22", ["Temp_1__°C", "Temp_2__°C"]], ) assert reg_score > 0.99 forecaster = SkProphet(return_upper_lower_bounds=True) - forecaster.fit(toy_data["2009-01-24":"2009-07-24"]) + forecaster.fit( + X=toy_data["2009-01-24":"2009-07-24"].index, + y=toy_data.loc["2009-01-24":"2009-07-24", ["Temp_1__°C", "Temp_2__°C"]], + ) + feat_out = list(forecaster.get_feature_names_out()) - predictions = forecaster.predict(toy_data["2009-07-27":"2009-07-30"]) + predictions = forecaster.predict(toy_data["2009-07-27":"2009-07-30"].index) assert np.all([feat in predictions.columns for feat in feat_out]) + + forecaster = SkProphet() + forecaster.fit( + X=toy_data.loc["2009-01-24":"2009-07-24", "Exo"], + y=toy_data.loc["2009-01-24":"2009-07-24", "Temp_3__°C"], + ) + + reg_score = forecaster.score( + toy_data.loc["2009-01-24":"2009-07-24", "Exo"], + toy_data.loc["2009-01-24":"2009-07-24", "Temp_3__°C"], + ) + + assert reg_score > 0.99 + assert forecaster.get_feature_names_in() == ["Exo"] diff --git a/tide/processing.py b/tide/processing.py index 8f74418..9a94e22 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1912,10 +1912,9 @@ def fill_x(self, X, group, col, idx, backcast): if self.resample_at_td: self._check_forecast_horizon(idx) x_fit, idx_pred = self._get_x_and_idx_at_freq(X.loc[group, col], idx, backcast) - bc_model.fit(x_fit) - to_predict = idx_pred.to_series() - to_predict.name = col - to_predict = to_predict[to_predict.index.isin(X.index)] + bc_model.fit(X=x_fit.index, y=x_fit) + to_predict = idx_pred + to_predict = to_predict[to_predict.isin(X.index)] # Here a bit dirty. STL doesn't allow forecast on its fitting set if self.model_name == "STL": to_predict = to_predict[~to_predict.isin(x_fit.index)] diff --git a/tide/regressors.py b/tide/regressors.py index efa4a11..c174fcd 100644 --- a/tide/regressors.py +++ b/tide/regressors.py @@ -1,32 +1,43 @@ import datetime as dt -import warnings import pandas as pd import numpy as np +from pandas import DatetimeIndex from statsmodels.tsa.arima.model import ARIMA from statsmodels.tsa.forecasting.stl import STLForecast from sklearn.base import RegressorMixin, BaseEstimator -from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted from prophet import Prophet from tide.base import BaseSTL, TideBaseMixin -from tide.utils import check_and_return_dt_index_df +from tide.utils import check_and_return_dt_index_df, check_datetime_index MODEL_MAP = {"ARIMA": ARIMA} MODEL_DEFAULT_CONF = {"ARIMA": {"order": (1, 1, 0), "trend": "t"}} -def series_to_prophet_df(x: pd.Series | pd.DatetimeIndex) -> pd.DataFrame: +def format_prophet_df( + x: pd.Series | pd.DataFrame | pd.DatetimeIndex, y: pd.Series = None +) -> pd.DataFrame: df = pd.DataFrame() - if isinstance(x, pd.Series): - idx = x.index - df["y"] = x.values + if y is not None: + if not x.shape[0] == y.shape[0]: + raise ValueError("x and y have incompatible shape") + df["y"] = y.values + + if not isinstance(x, pd.DatetimeIndex): + x = check_and_return_dt_index_df(x) + df["ds"] = x.index.tz_localize(None) + df[x.columns] = x.values + elif isinstance(x, pd.DatetimeIndex): + df["ds"] = x.tz_localize(None) else: - idx = x - df["ds"] = idx.tz_localize(None) + raise ValueError( + f"Invalid x. Was expecting an instance of DateTimeIndex" + f"DataFrame or Series, got {type(x)}" + ) return df @@ -99,26 +110,29 @@ def __init__( self.ar_model = ar_model self.ar_kwargs = ar_kwargs - def fit(self, X: pd.Series | pd.DataFrame, y=None): - X = check_and_return_dt_index_df(X) + def fit(self, X: pd.Index | pd.Series | pd.DataFrame, y=pd.Series | pd.DataFrame): + if not isinstance(X, pd.DatetimeIndex): + X = check_and_return_dt_index_df(X) + y = check_and_return_dt_index_df(y) + ar_model = MODEL_MAP[self.ar_model] if self.ar_kwargs is None: ar_kwargs = MODEL_DEFAULT_CONF[self.ar_model] else: ar_kwargs = self.ar_kwargs - self._pre_fit(X) + self._pre_fit(y) self.training_freq_ = ( - X.index.freq if X.index.freq is not None else X.index.inferred_freq + y.index.freq if y.index.freq is not None else y.index.inferred_freq ) if self.backcast: - X = X[::-1] - self.train_dat_end_ = X.index[-1] + y = y[::-1] + self.train_dat_end_ = y.index[-1] self.forecaster_ = {} - for feat in X: + for feat in y: self.forecaster_[feat] = STLForecast( - endog=X[feat].to_numpy(), + endog=y[feat].to_numpy(), model=ar_model, model_kwargs=ar_kwargs, **self.stl_kwargs, @@ -126,7 +140,7 @@ def fit(self, X: pd.Series | pd.DataFrame, y=None): return self - def predict(self, X: pd.Series | pd.DataFrame): + def predict(self, X: pd.DatetimeIndex | pd.Series | pd.DataFrame): check_is_fitted( self, attributes=[ @@ -135,9 +149,11 @@ def predict(self, X: pd.Series | pd.DataFrame): "training_freq_", ], ) - - X = check_and_return_dt_index_df(X) - check_array(X) + if isinstance(X, DatetimeIndex): + check_datetime_index(X) + X = X.to_frame() + else: + X = check_and_return_dt_index_df(X) if X.index.shape[0] == 2: X.index.freq = pd.tseries.frequencies.to_offset( @@ -162,13 +178,6 @@ def predict(self, X: pd.Series | pd.DataFrame): output_index = X.index[::-1] if self.backcast else X.index - if set(self.forecaster_.keys()) != set(X.columns): - warnings.warn( - "Columns in X differs from columns in the training DataSet. " - "Forecast will be performed for the trained data", - UserWarning, - ) - casting_steps = int( len(output_index) + abs(output_index[0] - self.train_dat_end_) / self.training_freq_ @@ -220,6 +229,64 @@ class SkProphet(RegressorMixin, BaseEstimator, TideBaseMixin): Fit the Prophet model to the input data. predict(X) Make predictions using the fitted Prophet model. + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> import datetime as dt + >>> from tide.regressors import SkProphet + + >>> index = pd.date_range("2009-01-01", "2009-12-31 23:00:00", freq="h", tz="UTC") + >>> cumsum_second = np.arange( + ... 0, (index[-1] - index[0]).total_seconds() + 1, step=3600 + ... ) + + >>> annual = 5 * -np.cos( + ... 2 * np.pi / dt.timedelta(days=360).total_seconds() * cumsum_second + ...) + + >>> daily = 5 * np.sin( + ... 2 * np.pi / dt.timedelta(days=1).total_seconds() * cumsum_second + ...) + + >>> toy_series = pd.Series(annual + daily + 5, index=index) + + >>> exo = 12 + 3 * np.arange(index.shape[0]) + + >>> toy_df = pd.DataFrame( + ... { + ... "Temp_3__°C": toy_series + exo, + ... "Exo": exo, + ... } + ...) + + >>> forecaster = SkProphet() + >>> forecaster.fit( + ... X=toy_df.loc["2009-01-24":"2009-07-24", "Exo"], + ... y=toy_df.loc["2009-01-24":"2009-07-24", "Temp_3__°C"], + ...) + + >>> result = forecaster.predict(X=toy_df.loc["2009-07-25":"2009-07-30", "Exo"]) + >>> print(result.head()) + Temp_3__°C + 2009-07-25 00:00:00+00:00 14781.715143 + 2009-07-25 01:00:00+00:00 14786.009401 + 2009-07-25 02:00:00+00:00 14790.215280 + 2009-07-25 03:00:00+00:00 14794.250621 + 2009-07-25 04:00:00+00:00 14798.044970 + + Notes + ----- + - Additional regressors are passed in X during fitting operation + - Holidays cannot be configured in this regressor. We recommend to pass it + as a feature during the fitting process. It will be treated as an additional + regressor + + Returns + ------- + pd.DataFrame + A DataFrame with DateTime index. Columns are the y targets """ def __init__( @@ -237,43 +304,58 @@ def __init__( self.return_upper_lower_bounds = return_upper_lower_bounds self.backcast = backcast - def fit(self, X: pd.Series | pd.DataFrame, y=None): - X = check_and_return_dt_index_df(X) + def fit(self, X: pd.Index | pd.Series | pd.DataFrame, y=pd.Series | pd.DataFrame): + y = check_and_return_dt_index_df(y) + self.feature_names_out_ = list(y.columns) + if isinstance(X, pd.Series) or isinstance(X, pd.DataFrame): + X = check_and_return_dt_index_df(X) + self.feature_names_in_ = list(X.columns) + else: + check_datetime_index(X) + self.feature_names_in_ = [] + self.forecaster_ = {} - self.fit_check_features(X) if self.return_upper_lower_bounds: self.added_columns = [] for bound in ["upper", "lower"]: - for feat in self.feature_names_in_: + for feat in self.feature_names_out_: parts = feat.split("__") parts[0] = f"{parts[0]}_{bound}" self.added_columns.append("__".join(parts)) - for feat in X: - x = series_to_prophet_df(X[feat]) - self.forecaster_[feat] = Prophet( + for target in y: + prophet_df = format_prophet_df(X, y[target]) + model = Prophet( seasonality_prior_scale=self.seasonality_prior_scale, changepoint_prior_scale=self.changepoint_prior_scale, **self.prophet_kwargs, - ).fit(x) + ) + if isinstance(X, pd.DataFrame): + for feat in X: + model.add_regressor(feat) + self.forecaster_[target] = model.fit(prophet_df) return self - def predict(self, X: pd.Series | pd.DataFrame): - X = check_and_return_dt_index_df(X) + def predict(self, X: pd.Index | pd.Series | pd.DataFrame): check_is_fitted( self, attributes=["forecaster_", "feature_names_in_"], ) - if not np.all([f in self.feature_names_in_ for f in X.columns]): - raise ValueError( - "One of the requested feature was not present during fitting" - ) - X = check_and_return_dt_index_df(X) - inferred_df = pd.DataFrame(index=X.index) + if isinstance(X, pd.Series) or isinstance(X, pd.DataFrame): + X = check_and_return_dt_index_df(X) + if not np.all([f in self.feature_names_in_ for f in X.columns]): + raise ValueError( + "One of the requested feature was not present during fitting" + ) + else: + check_datetime_index(X) + + out_idx = X if isinstance(X, pd.DatetimeIndex) else X.index + inferred_df = pd.DataFrame(index=out_idx) for feat in self.forecaster_.keys(): - x = series_to_prophet_df(X.index) - prediction = self.forecaster_[feat].predict(x) + df_prophet = format_prophet_df(X) + prediction = self.forecaster_[feat].predict(df_prophet) inferred_df[feat] = prediction["yhat"].values if self.return_upper_lower_bounds: for bound in ["upper", "lower"]: diff --git a/tide/utils.py b/tide/utils.py index 0ec969c..ed0bada 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -208,17 +208,22 @@ def data_columns_to_tree(columns: pd.Index | list[str]) -> T: return dict_to_tree(parsed_dict, sep="__") +def check_datetime_index(idx: pd.DatetimeIndex): + if not isinstance(idx, pd.DatetimeIndex): + raise ValueError("Index is not a pandas DateTime index") + + if idx.tz is None: + raise ValueError("Index must be tz_localized") + + def check_and_return_dt_index_df(X: pd.Series | pd.DataFrame) -> pd.DataFrame: if not (isinstance(X, pd.Series) or isinstance(X, pd.DataFrame)): raise ValueError( f"Invalid X data, was expected an instance of pandas Dataframe " f"or Pandas Series. Got {type(X)}" ) - if not isinstance(X.index, pd.DatetimeIndex): - raise ValueError("X index is not a pandas DateTime index") - if X.index.tz is None: - raise ValueError("X index must be tz_localized") + check_datetime_index(X.index) return X.to_frame() if isinstance(X, pd.Series) else X