1 change: 1 addition & 0 deletions docs/api_reference/index.rst
@@ -4,6 +4,7 @@ API Reference
.. toctree::
:maxdepth: 2

utils
plumbing
processing
regressor
7 changes: 7 additions & 0 deletions docs/api_reference/utils.rst
@@ -0,0 +1,7 @@
Utils Modules
===============

Tide's utility functions and classes,
mostly for handling tags, generating trees, and finding and selecting data gaps.

.. autofunction:: tide.utils.tide_request
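For context, a minimal usage sketch of the documented helper, with column names borrowed from this PR's tests (doctest style, as in the docstring further down):

>>> from tide.utils import tide_request
>>> tide_request(["name_1__°C__bloc1", "name_5__kWh"], "kWh|°C")
['name_5__kWh', 'name_1__°C__bloc1']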
8 changes: 4 additions & 4 deletions tests/test_processing.py
@@ -993,7 +993,7 @@ def test_replace_tag(self):
def test_add_fourier_pairs(self):
test_df = pd.DataFrame(
data=np.arange(24).astype("float64"),
index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"),
index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"),
columns=["feat_1"],
)

@@ -1036,14 +1036,14 @@ def test_add_fourier_pairs(self):
"1 days 00:00:00_order_2_Sine",
"1 days 00:00:00_order_2_Cosine",
],
index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"),
index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"),
)

pd.testing.assert_frame_equal(res, ref_df)

test_df_phi = pd.DataFrame(
data=np.arange(24),
index=pd.date_range("2009-01-01 06:00:00", freq="H", periods=24),
index=pd.date_range("2009-01-01 06:00:00", freq="h", periods=24),
columns=["feat_1"],
)
test_df_phi = test_df_phi.tz_localize("UTC")
@@ -1053,7 +1053,7 @@

test_df = pd.DataFrame(
data=np.arange(24).astype("float64"),
index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"),
index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"),
columns=["feat_1__°C__building__room"],
)

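These freq="H" to freq="h" updates follow pandas' deprecation of uppercase offset aliases (pandas 2.2+): both spellings build the same hourly index, but only the lowercase form avoids a FutureWarning. A minimal sketch:

import pandas as pd

# "h" is the non-deprecated hourly offset alias in pandas 2.2+
idx = pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC")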
29 changes: 21 additions & 8 deletions tests/test_utils.py
@@ -10,7 +10,7 @@
data_columns_to_tree,
get_data_col_names_from_root,
get_data_level_values,
parse_request_to_col_names,
tide_request,
timedelta_to_int,
NamedList,
_get_series_bloc,
@@ -58,7 +58,7 @@ def test_columns_parser(self):
assert all(col in DF_COLUMNS.columns for col in col_names)

def test_parse_request_to_col_names(self):
res = parse_request_to_col_names(DF_COLUMNS)
res = tide_request(DF_COLUMNS)
assert res == [
"name_1__°C__bloc1",
"name_1__°C__bloc2",
@@ -69,10 +69,13 @@ def test_parse_request_to_col_names(self):
"name4__DIMENSIONLESS__bloc4",
]

res = parse_request_to_col_names(DF_COLUMNS, "name_1__°C__bloc1")
res = tide_request(DF_COLUMNS, "name_1__°C__bloc1")
assert res == ["name_1__°C__bloc1"]

res = parse_request_to_col_names(
res = tide_request(DF_COLUMNS, ["name_1__°C__bloc1"])
assert res == ["name_1__°C__bloc1"]

res = tide_request(
DF_COLUMNS,
[
"name_1__°C__bloc1",
@@ -84,18 +87,28 @@
"name_1__°C__bloc2",
]

res = parse_request_to_col_names(DF_COLUMNS, "°C")
res = tide_request(DF_COLUMNS, "°C")
assert res == ["name_1__°C__bloc1", "name_1__°C__bloc2"]

res = parse_request_to_col_names(DF_COLUMNS, "OTHER")
res = tide_request(DF_COLUMNS, "OTHER")
assert res == ["name_2", "name_3__kWh/m²", "name_5__kWh"]

res = parse_request_to_col_names(DF_COLUMNS, "DIMENSIONLESS__bloc2")
res = tide_request(DF_COLUMNS, "DIMENSIONLESS__bloc2")
assert res == ["name_2__DIMENSIONLESS__bloc2"]

res = parse_request_to_col_names(DF_COLUMNS, "kWh")
res = tide_request(DF_COLUMNS, "kWh")
assert res == ["name_5__kWh"]

res = tide_request(DF_COLUMNS, "kWh|°C")
assert res == ["name_5__kWh", "name_1__°C__bloc1", "name_1__°C__bloc2"]

res = tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"])
assert res == [
"name_5__kWh",
"name_1__°C__bloc1",
"name_1__°C__bloc2",
]

def test_get_data_level_names(self):
root = data_columns_to_tree(DF_COLUMNS.columns)
res = get_data_level_values(root, "name")
4 changes: 2 additions & 2 deletions tide/plot.py
@@ -6,7 +6,7 @@

from tide.utils import (
check_and_return_dt_index_df,
parse_request_to_col_names,
tide_request,
data_columns_to_tree,
get_data_level_values,
get_data_blocks,
@@ -63,7 +63,7 @@ def get_cols_axis_maps_and_labels(
col_axes_map = {}
axes_col_map = {}
for i, tag in enumerate(y_tags):
selected_cols = parse_request_to_col_names(columns, tag)
selected_cols = tide_request(columns, tag)
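# Plotly convention: the first y-axis is named "y", later overlaid axes "y2", "y3", ...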
axes_col_map["y" if i == 0 else f"y{i + 1}"] = selected_cols
for col in selected_cols:
col_axes_map[col] = {"yaxis": "y"} if i == 0 else {"yaxis": f"y{i + 1}"}
14 changes: 6 additions & 8 deletions tide/plumbing.py
@@ -8,7 +8,7 @@
from sklearn.compose import ColumnTransformer

from tide.utils import (
parse_request_to_col_names,
tide_request,
check_and_return_dt_index_df,
data_columns_to_tree,
get_data_level_values,
@@ -62,7 +62,7 @@ def _get_column_wise_transformer(
) -> ColumnTransformer | None:
col_trans_list = []
for req, proc_list in proc_dict.items():
requested_col = parse_request_to_col_names(data_columns, req)
requested_col = tide_request(data_columns, req)
if not requested_col:
pass
else:
@@ -358,7 +358,7 @@ def select(
pd.Index
Selected column names
"""
return parse_request_to_col_names(self.data, select)
return tide_request(self.data, select)

def get_pipeline(
self,
@@ -438,7 +438,7 @@ def get_pipeline(
"""
if self.data is None:
raise ValueError("data is required to build a pipeline")
selection = parse_request_to_col_names(self.data, select)
selection = tide_request(self.data, select)
if steps is None or self.pipe_dict is None:
dict_to_pipe = None
else:
@@ -541,7 +541,7 @@ def get_corrected_data(
"""
if self.data is None:
raise ValueError("Cannot get corrected data. data are missing")
select = parse_request_to_col_names(self.data, select)
select = tide_request(self.data, select)
data = self.data.loc[
start or self.data.index[0] : stop or self.data.index[-1], select
].copy()
@@ -834,9 +834,7 @@ def plot(
# for example) So we just process the whole data hoping to find the result
# after.
select_corr = (
self.data.columns
if not parse_request_to_col_names(self.data, select)
else select
self.data.columns if not tide_request(self.data, select) else select
)

data_1 = self.get_corrected_data(select_corr, start, stop, steps, verbose)
6 changes: 2 additions & 4 deletions tide/processing.py
@@ -13,7 +13,7 @@
get_data_blocks,
get_outer_timestamps,
check_and_return_dt_index_df,
parse_request_to_col_names,
tide_request,
ensure_list,
)
from tide.regressors import SkSTLForecast, SkProphet
@@ -1269,9 +1269,7 @@ def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None):
if self.tide_format_methods:
self.columns_methods = []
for req, method in self.tide_format_methods.items():
self.columns_methods.append(
(parse_request_to_col_names(X.columns, req), method)
)
self.columns_methods.append((tide_request(X.columns, req), method))

return self

152 changes: 124 additions & 28 deletions tide/utils.py
@@ -143,40 +143,136 @@ def get_data_col_names_from_root(data_root):
][-1]


def parse_request_to_col_names(
def find_cols_with_tide_tags(
data_columns: pd.Index | list[str], request: str
) -> list[str]:
request_parts = request.split("__")

if not (1 <= len(request_parts) <= 4):
raise ValueError(
f"Request '{request}' is malformed. "
f"Use 'name__unit__bloc__sub_bloc' format or a "
f"combination of these tags."
)

full_tag_col_map = {
col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col
for col in data_columns
}

def find_exact_match(search_str, target):
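# Anchor search_str so it only matches a whole "__"-delimited tag part, never a substring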
pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)"
match = re.search(pattern, target)
return match is not None

return [
full_tag_col_map[augmented_col]
for augmented_col in full_tag_col_map.keys()
if all(find_exact_match(part, augmented_col) for part in request_parts)
]
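A quick check of the part-level matching, with assumed column names (consistent with the docstring notes below):

>>> find_cols_with_tide_tags(["x__°C__bloc1", "y__°C__bloc11"], "bloc1")
['x__°C__bloc1']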


def find_cols_multiple_tag_groups(
data_columns: pd.Index | list[str], request: str
) -> list[str]:
request_parts = request.split("|")
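# Each "|"-separated group is an independent tag query; matches are concatenated (OR semantics)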
list_to_return = []
for req in request_parts:
list_to_return.extend(find_cols_with_tide_tags(data_columns, req))
return list_to_return


def tide_request(
data_columns: pd.Index | list[str], request: str | pd.Index | list[str] = None
) -> list[str]:
"""
Select columns by matching structured TIDE-style tags.

Filters column names based on a TIDE-style structured tag syntax. Columns are
expected to use a naming convention with double underscores (`__`) separating
tags.

A column name can include up to four hierarchical parts,
'name__unit__bloc__sub_bloc', where each part is optional but parts must be
separated by double underscores.

The `request` argument allows searching for columns matching one or more
of these parts using full or partial tag patterns. Multiple tag patterns
can be combined using the pipe `|` character to form OR conditions.

Parameters
----------
data_columns : pandas.Index or list of str
A collection of column names to filter. Each column name should follow
the TIDE format (e.g., "sensor__°C__bloc1").

request : str or list of str or pandas.Index, optional
Tag(s) to match against the column names. Each tag string may be:

- A full structured tag (e.g., "name__°C__bloc2")
- A partial tag (e.g., "°C", "bloc1")
- A group of tags separated by "|" (e.g., "kWh|°C")

If None, all columns from `data_columns` are returned.

Returns
-------
list of str
The list of column names that match any of the provided tag queries.

Notes
-----
- Matching is done on whole tag parts, not on substrings. For instance, the query
"bloc1" matches "name__°C__bloc1" but not "bloc11".
- If multiple requests are given, columns are returned if they match
at least one of them (logical OR).
- Tags can include between 1 and 4 parts, split by `__`.

Examples
--------
>>> DF_COLUMNS = [
... "name_1__°C__bloc1",
... "name_1__°C__bloc2",
... "name_2",
... "name_2__DIMENSIONLESS__bloc2",
... "name_3__kWh/m²",
... "name_5__kWh",
... "name4__DIMENSIONLESS__bloc4",
... ]

>>> tide_request(DF_COLUMNS)
['name_1__°C__bloc1', 'name_1__°C__bloc2', 'name_2',
'name_2__DIMENSIONLESS__bloc2', 'name_3__kWh/m²',
'name_5__kWh', 'name4__DIMENSIONLESS__bloc4']

>>> tide_request(DF_COLUMNS, "°C")
['name_1__°C__bloc1', 'name_1__°C__bloc2']

>>> tide_request(DF_COLUMNS, "kWh|°C")
['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2']

>>> # Columns are not selected twice
>>> tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"])
['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2']
"""

if request is None:
return list(data_columns)

elif isinstance(request, pd.Index) or isinstance(request, list):
return [col for col in request if col in data_columns]
elif isinstance(request, str):
request = [request]

else:
request_parts = request.split("__")

if not (1 <= len(request_parts) <= 4):
raise ValueError(
f"Request '{request}' is malformed. "
f"Use 'name__unit__bloc__sub_bloc' format or a "
f"combination of these tags."
)

full_tag_col_map = {
col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col
for col in data_columns
}

def find_exact_match(search_str, target):
pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)"
match = re.search(pattern, target)
return match is not None

return [
full_tag_col_map[augmented_col]
for augmented_col in full_tag_col_map.keys()
if all(find_exact_match(part, augmented_col) for part in request_parts)
]
if not (isinstance(request, pd.Index) or isinstance(request, list)):
raise ValueError(
"Invalid request. Expected an instance of str, pd.Index or list[str], "
f"got {type(request)} instead."
)

list_to_return = []
for req in request:
list_to_return.extend(find_cols_multiple_tag_groups(data_columns, req))

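# dict.fromkeys deduplicates while preserving first-seen column order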
return list(dict.fromkeys(list_to_return))


def data_columns_to_tree(columns: pd.Index | list[str]) -> T: