From df1fb1f48e0bd0861bf603c0b383e3836e177bfa Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Thu, 27 Feb 2025 15:19:17 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=A5CombineColumns.=20Selection=20can?= =?UTF-8?q?=20no=20longer=20be=20performed=20in=20processor.=20Available?= =?UTF-8?q?=20methods=20are=20mapped?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 39 ++++++++++++------ tide/meteo.py | 2 +- tide/processing.py | 88 ++++++++++++++++------------------------ 3 files changed, 61 insertions(+), 68 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index 9de3aa1..ff9a382 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -483,14 +483,12 @@ def test_pd_gaussian_filter(self): def test_pd_combine_columns(self): x_in = pd.DataFrame( - {"a__°C": [1, 2], "b__°C": [1, 2], "c": [1, 2]}, + {"a__°C": [1, 2], "b__°C": [2, 4]}, index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ) trans = CombineColumns( - function=np.sum, - columns=["a__°C", "b__°C"], - function_kwargs={"axis": 1}, + function="sum", drop_columns=True, ) @@ -498,29 +496,44 @@ def test_pd_combine_columns(self): pd.testing.assert_frame_equal( res, pd.DataFrame( - {"c": [1, 2], "combined": [2, 4]}, + {"combined": [3, 6]}, index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), ), ) + + trans = CombineColumns( + function="mean", + drop_columns=True, + ) + + res = trans.fit_transform(x_in.copy()) + pd.testing.assert_frame_equal( + res, + pd.DataFrame( + {"combined": [1.5, 3]}, + index=pd.date_range("2009", freq="h", periods=2, tz="UTC"), + ), + ) + check_feature_names_out(trans, res) ref = x_in.copy() - ref["combined"] = [2, 4] - trans.set_params(drop_columns=False) - res = trans.fit_transform(x_in) + ref["combined"] = [1.8, 3.6] + trans.set_params(function="average", weights=[1, 4], drop_columns=False) + res = trans.fit_transform(x_in.copy()) pd.testing.assert_frame_equal(res, 
ref) check_feature_names_out(trans, res) - ref["combined_2"] = [2, 4] + ref = x_in.copy() + ref["combined_2"] = [5, 10] trans = CombineColumns( - function=np.sum, - tide_format_columns="°C", - function_kwargs={"axis": 1}, + function="dot", + weights=[1, 2], drop_columns=False, result_column_name="combined_2", ) - res = trans.fit_transform(x_in) + res = trans.fit_transform(x_in.copy()) pd.testing.assert_frame_equal(res, ref) check_feature_names_out(trans, res) diff --git a/tide/meteo.py b/tide/meteo.py index 760eea3..4a87422 100644 --- a/tide/meteo.py +++ b/tide/meteo.py @@ -20,7 +20,7 @@ "surface_thermal_radiation": "surface_thermal_radiation (W/m^2)", "total_cloud_cover": "total_cloud_cover (0-1)", "total_precipitation": "total_precipitation (mm of water equivalent)", - "wind_direction": "wind_direction (deg)" + "wind_direction": "wind_direction (deg)", } diff --git a/tide/processing.py b/tide/processing.py index 81a8c47..c128b1b 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -20,6 +20,8 @@ from tide.classifiers import STLEDetector from tide.meteo import sun_position, beam_component, sky_diffuse, ground_diffuse +FUNCTION_MAP = {"mean": np.mean, "average": np.average, "sum": np.sum, "dot": np.dot} + MODEL_MAP = {"STL": SkSTLForecast, "Prophet": SkProphet} OIKOLAB_DEFAULT_MAP = { @@ -916,83 +918,61 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class CombineColumns(BaseProcessing): """ - A class that combines multiple columns in a pandas DataFrame using a specified - function. + A class that combines multiple columns in a pandas DataFrame using mean, sum, + average, or dot. Original columns can be dropped. Parameters ---------- - function (callable or None): A function or method to apply for combining - columns. - tide_format_columns str: Tide request format. Columns are determined using - tide columns format name__unit__bloc. It override the columns attribute - columns (list or None): A list of column names to combine. 
- If None, all columns will be combined. - - function_kwargs (dict or None): Additional keyword arguments to pass to the - combining function. - drop_columns (bool): If True, the original columns to combine will be dropped - from the DataFrame. If False, the original columns will be retained. - label_name (str): The name of the new column that will store the combined - values. - - Attributes - ---------- - columns : list - The column names of the input DataFrame. - index : pandas.Index - The index of the input DataFrame. - - Methods - ------- - get_feature_names_out(input_features=None) - Get output feature names for the transformed data. - fit(X, y=None) - Fit the transformer to the input data. - transform(X, y=None) - Transform the input data by applying the function + function (str): The name of the function to apply for combining columns. + Valid names are "mean", "sum", "average", "dot". + weights (list[float | int] or np.ndarray, optional): Weights to apply when + using 'average' or 'dot'. Raises ValueError with 'mean' or 'sum'. + drop_columns (bool): If True, the original columns used for combining will + be dropped from the DataFrame. If False, they will be retained. + result_column_name (str): The name of the new column that will store the + combined values. 
""" def __init__( self, - function: Callable, - tide_format_columns: str = None, - columns=None, - function_kwargs: dict = {}, + function: str, + weights: list[float | int] | np.ndarray = None, drop_columns: bool = False, result_column_name: str = "combined", ): BaseProcessing.__init__(self) self.function = function - self.tide_format_columns = tide_format_columns - self.columns = columns - self.function_kwargs = function_kwargs + self.weights = weights self.drop_columns = drop_columns self.result_column_name = result_column_name def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): - if self.columns is None and self.tide_format_columns is None: - raise ValueError("Provide at least one of columns or tide_format_columns") - - self.required_columns = ( - parse_request_to_col_names(X.columns, self.tide_format_columns) - if self.tide_format_columns - else self.columns - ) self.fit_check_features(X) - if self.drop_columns: - self.feature_names_out_ = list(X.columns.drop(self.required_columns)) - if self.result_column_name in self.feature_names_out_: + self.method_ = FUNCTION_MAP[self.function] + if self.function in ["mean", "sum"] and self.weights is not None: raise ValueError( - f"label_name {self.result_column_name} already in X columns. " - f"It cannot be overwritten" + f"Weights have been provided, but {self.function} " + f"cannot use it. 
Use one of 'average' or 'dot'" ) + + if self.drop_columns: + self.feature_names_out_ = [] + self.feature_names_out_.append(self.result_column_name) def _transform_implementation(self, X: pd.Series | pd.DataFrame): - check_is_fitted(self, attributes=["feature_names_in_", "feature_names_out_"]) - X[self.result_column_name] = self.function( - X[self.required_columns], **self.function_kwargs + check_is_fitted( + self, attributes=["feature_names_in_", "feature_names_out_", "method_"] ) + + if self.function == "average": + X[self.result_column_name] = self.method_(X, axis=1, weights=self.weights) + elif self.function == "dot": + X[self.result_column_name] = self.method_(X, self.weights) + + else: + X[self.result_column_name] = self.method_(X, axis=1) + return X[self.feature_names_out_]