diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst index fc4d4c7..0a25966 100644 --- a/docs/api_reference/index.rst +++ b/docs/api_reference/index.rst @@ -4,6 +4,7 @@ API Reference .. toctree:: :maxdepth: 2 + utils plumbing processing regressor diff --git a/docs/api_reference/utils.rst b/docs/api_reference/utils.rst new file mode 100644 index 0000000..97dc156 --- /dev/null +++ b/docs/api_reference/utils.rst @@ -0,0 +1,7 @@ +Utils Modules +=============== + +Tide's utility functions and classes. +Mostly for handling tags, generating trees, or finding and selecting data gaps. + +.. autofunction:: tide.utils.tide_request \ No newline at end of file diff --git a/tests/test_processing.py b/tests/test_processing.py index 19bce1d..39dbbb2 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -993,7 +993,7 @@ def test_replace_tag(self): def test_add_fourier_pairs(self): test_df = pd.DataFrame( data=np.arange(24).astype("float64"), - index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"), + index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"), columns=["feat_1"], ) @@ -1036,14 +1036,14 @@ def test_add_fourier_pairs(self): "1 days 00:00:00_order_2_Sine", "1 days 00:00:00_order_2_Cosine", ], - index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"), + index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"), ) pd.testing.assert_frame_equal(res, ref_df) test_df_phi = pd.DataFrame( data=np.arange(24), - index=pd.date_range("2009-01-01 06:00:00", freq="H", periods=24), + index=pd.date_range("2009-01-01 06:00:00", freq="h", periods=24), columns=["feat_1"], ) test_df_phi = test_df_phi.tz_localize("UTC") @@ -1053,7 +1053,7 @@ def test_add_fourier_pairs(self): test_df = pd.DataFrame( data=np.arange(24).astype("float64"), - index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"), + index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, 
tz="UTC"), columns=["feat_1__°C__building__room"], ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9461b9c..2962ca2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,7 +10,7 @@ data_columns_to_tree, get_data_col_names_from_root, get_data_level_values, - parse_request_to_col_names, + tide_request, timedelta_to_int, NamedList, _get_series_bloc, @@ -58,7 +58,7 @@ def test_columns_parser(self): assert all(col in DF_COLUMNS.columns for col in col_names) def test_parse_request_to_col_names(self): - res = parse_request_to_col_names(DF_COLUMNS) + res = tide_request(DF_COLUMNS) assert res == [ "name_1__°C__bloc1", "name_1__°C__bloc2", @@ -69,10 +69,13 @@ def test_parse_request_to_col_names(self): "name4__DIMENSIONLESS__bloc4", ] - res = parse_request_to_col_names(DF_COLUMNS, "name_1__°C__bloc1") + res = tide_request(DF_COLUMNS, "name_1__°C__bloc1") assert res == ["name_1__°C__bloc1"] - res = parse_request_to_col_names( + res = tide_request(DF_COLUMNS, ["name_1__°C__bloc1"]) + assert res == ["name_1__°C__bloc1"] + + res = tide_request( DF_COLUMNS, [ "name_1__°C__bloc1", @@ -84,18 +87,28 @@ def test_parse_request_to_col_names(self): "name_1__°C__bloc2", ] - res = parse_request_to_col_names(DF_COLUMNS, "°C") + res = tide_request(DF_COLUMNS, "°C") assert res == ["name_1__°C__bloc1", "name_1__°C__bloc2"] - res = parse_request_to_col_names(DF_COLUMNS, "OTHER") + res = tide_request(DF_COLUMNS, "OTHER") assert res == ["name_2", "name_3__kWh/m²", "name_5__kWh"] - res = parse_request_to_col_names(DF_COLUMNS, "DIMENSIONLESS__bloc2") + res = tide_request(DF_COLUMNS, "DIMENSIONLESS__bloc2") assert res == ["name_2__DIMENSIONLESS__bloc2"] - res = parse_request_to_col_names(DF_COLUMNS, "kWh") + res = tide_request(DF_COLUMNS, "kWh") assert res == ["name_5__kWh"] + res = tide_request(DF_COLUMNS, "kWh|°C") + assert res == ["name_5__kWh", "name_1__°C__bloc1", "name_1__°C__bloc2"] + + res = tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"]) + assert res == [ + 
"name_5__kWh", + "name_1__°C__bloc1", + "name_1__°C__bloc2", + ] + def test_get_data_level_names(self): root = data_columns_to_tree(DF_COLUMNS.columns) res = get_data_level_values(root, "name") diff --git a/tide/plot.py b/tide/plot.py index 218443a..3b58318 100644 --- a/tide/plot.py +++ b/tide/plot.py @@ -6,7 +6,7 @@ from tide.utils import ( check_and_return_dt_index_df, - parse_request_to_col_names, + tide_request, data_columns_to_tree, get_data_level_values, get_data_blocks, @@ -63,7 +63,7 @@ def get_cols_axis_maps_and_labels( col_axes_map = {} axes_col_map = {} for i, tag in enumerate(y_tags): - selected_cols = parse_request_to_col_names(columns, tag) + selected_cols = tide_request(columns, tag) axes_col_map["y" if i == 0 else f"y{i + 1}"] = selected_cols for col in selected_cols: col_axes_map[col] = {"yaxis": "y"} if i == 0 else {"yaxis": f"y{i + 1}"} diff --git a/tide/plumbing.py b/tide/plumbing.py index 19e7139..21591aa 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -8,7 +8,7 @@ from sklearn.compose import ColumnTransformer from tide.utils import ( - parse_request_to_col_names, + tide_request, check_and_return_dt_index_df, data_columns_to_tree, get_data_level_values, @@ -62,7 +62,7 @@ def _get_column_wise_transformer( ) -> ColumnTransformer | None: col_trans_list = [] for req, proc_list in proc_dict.items(): - requested_col = parse_request_to_col_names(data_columns, req) + requested_col = tide_request(data_columns, req) if not requested_col: pass else: @@ -358,7 +358,7 @@ def select( pd.Index Selected column names """ - return parse_request_to_col_names(self.data, select) + return tide_request(self.data, select) def get_pipeline( self, @@ -438,7 +438,7 @@ def get_pipeline( """ if self.data is None: raise ValueError("data is required to build a pipeline") - selection = parse_request_to_col_names(self.data, select) + selection = tide_request(self.data, select) if steps is None or self.pipe_dict is None: dict_to_pipe = None else: @@ -541,7 +541,7 @@ def 
get_corrected_data( """ if self.data is None: raise ValueError("Cannot get corrected data. data are missing") - select = parse_request_to_col_names(self.data, select) + select = tide_request(self.data, select) data = self.data.loc[ start or self.data.index[0] : stop or self.data.index[-1], select ].copy() @@ -834,9 +834,7 @@ def plot( # for example) So we just process the whole data hoping to find the result # after. select_corr = ( - self.data.columns - if not parse_request_to_col_names(self.data, select) - else select + self.data.columns if not tide_request(self.data, select) else select ) data_1 = self.get_corrected_data(select_corr, start, stop, steps, verbose) diff --git a/tide/processing.py b/tide/processing.py index 9a94e22..a4660ab 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -13,7 +13,7 @@ get_data_blocks, get_outer_timestamps, check_and_return_dt_index_df, - parse_request_to_col_names, + tide_request, ensure_list, ) from tide.regressors import SkSTLForecast, SkProphet @@ -1269,9 +1269,7 @@ def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): if self.tide_format_methods: self.columns_methods = [] for req, method in self.tide_format_methods.items(): - self.columns_methods.append( - (parse_request_to_col_names(X.columns, req), method) - ) + self.columns_methods.append((tide_request(X.columns, req), method)) return self diff --git a/tide/utils.py b/tide/utils.py index ed0bada..d064881 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -143,40 +143,136 @@ def get_data_col_names_from_root(data_root): ][-1] -def parse_request_to_col_names( +def find_cols_with_tide_tags( + data_columns: pd.Index | list[str], request: str +) -> list[str]: + request_parts = request.split("__") + + if not (1 <= len(request_parts) <= 4): + raise ValueError( + f"Request '{request}' is malformed. " + f"Use 'name__unit__bloc__sub_bloc' format or a " + f"combination of these tags." 
+ ) + + full_tag_col_map = { + col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col + for col in data_columns + } + + def find_exact_match(search_str, target): + pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)" + match = re.search(pattern, target) + return match is not None + + return [ + full_tag_col_map[augmented_col] + for augmented_col in full_tag_col_map.keys() + if all(find_exact_match(part, augmented_col) for part in request_parts) + ] + + +def find_cols_multiple_tag_groups( + data_columns: pd.Index | list[str], request: str +) -> list[str]: + request_parts = request.split("|") + list_to_return = [] + for req in request_parts: + list_to_return.extend(find_cols_with_tide_tags(data_columns, req)) + return list_to_return + + +def tide_request( data_columns: pd.Index | list[str], request: str | pd.Index | list[str] = None ) -> list[str]: + """ + Select columns by matching structured TIDE-style tags. + + Filters column names based on a TIDE-style structured tag syntax. Columns are + expected to use a naming convention with double underscores (`__`) separating + tags. + + A column name can include up to four hierarchical parts: + 'name__unit__bloc__sub_bloc' where each part is optional, but must be separated + with double underscores. + + The `request` argument allows searching for columns matching one or more + of these parts using full or partial tag patterns. Multiple tag patterns + can be combined using the pipe `|` character to form OR conditions. + + Parameters + ---------- + data_columns : pandas.Index or list of str + A collection of column names to filter. Each column name should follow + the TIDE format (e.g., "sensor__°C__bloc1"). + + request : str or list of str or pandas.Index, optional + Tag(s) to match against the column names. 
Each tag string may be: + + - A full structured tag (e.g., "name__°C__bloc2") + - A partial tag (e.g., "°C", "bloc1") + - A group of tags separated by "|" (e.g., "kWh|°C") + + If None, all columns from `data_columns` are returned. + + Returns + ------- + list of str + The list of column names that match any of the provided tag queries. + + Notes + ----- + - Matching is done per tag part, not substrings. For instance, the query + "bloc1" will match "name__°C__bloc1" but not "bloc11". + - If multiple requests are given, columns are returned if they match + at least one of them (logical OR). + - Tags can include between 1 and 4 parts, split by `__`. + + Examples + -------- + >>> DF_COLUMNS = [ + ... "name_1__°C__bloc1", + ... "name_1__°C__bloc2", + ... "name_2", + ... "name_2__DIMENSIONLESS__bloc2", + ... "name_3__kWh/m²", + ... "name_5__kWh", + ... "name4__DIMENSIONLESS__bloc4", + ... ] + + >>> tide_request(DF_COLUMNS) + ['name_1__°C__bloc1', 'name_1__°C__bloc2', 'name_2', + 'name_2__DIMENSIONLESS__bloc2', 'name_3__kWh/m²', + 'name_5__kWh', 'name4__DIMENSIONLESS__bloc4'] + + >>> tide_request(DF_COLUMNS, "°C") + ['name_1__°C__bloc1', 'name_1__°C__bloc2'] + + >>> tide_request(DF_COLUMNS, "kWh|°C") + ['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2'] + + >>> # Columns are not selected twice + >>> tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"]) + ['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2'] + """ + if request is None: return list(data_columns) - elif isinstance(request, pd.Index) or isinstance(request, list): - return [col for col in request if col in data_columns] + elif isinstance(request, str): + request = [request] - else: - request_parts = request.split("__") - - if not (1 <= len(request_parts) <= 4): - raise ValueError( - f"Request '{request}' is malformed. " - f"Use 'name__unit__bloc__sub_bloc' format or a " - f"combination of these tags." 
- ) - - full_tag_col_map = { - col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col - for col in data_columns - } - - def find_exact_match(search_str, target): - pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)" - match = re.search(pattern, target) - return match is not None - - return [ - full_tag_col_map[augmented_col] - for augmented_col in full_tag_col_map.keys() - if all(find_exact_match(part, augmented_col) for part in request_parts) - ] + if not (isinstance(request, pd.Index) or isinstance(request, list)): + raise ValueError( + "Invalid request. Was expected an instance of str, pd.Index or List[str]" + f"got {type(request)} instead" + ) + + list_to_return = [] + for req in request: + list_to_return.extend(find_cols_multiple_tag_groups(data_columns, req)) + + return list(dict.fromkeys(list_to_return)) def data_columns_to_tree(columns: pd.Index | list[str]) -> T: