From 5121800c66c20b3b2a4c2620773d00ae551845bf Mon Sep 17 00:00:00 2001 From: tashiro akira Date: Tue, 24 Oct 2023 10:40:45 +0900 Subject: [PATCH 1/3] Fixed an error when there were many missing bool columns in the input data Signed-off-by: tashiro akira --- .../preprocessing_templates/fillna-type-string.py.jinja | 2 ++ .../preprocessing_templates/fillna-type-string_predict.py.jinja | 1 + .../preprocessing_templates/fillna-type-string_train.py.jinja | 1 + 3 files changed, 4 insertions(+) diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja index 5a8da92..5d0787b 100644 --- a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja +++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja @@ -9,6 +9,8 @@ simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') {% endif %} {% if cols_almost_missing_string %} STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} +{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) +{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {% endif %} \ No newline at end of file diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja index ef58ba7..472ec11 100644 --- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja +++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja @@ -7,5 +7,6 @@ STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 
{% endif %} {% if cols_almost_missing_string %} STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} +{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {% endif %} \ No newline at end of file diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja index 404804b..81d5621 100644 --- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja +++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja @@ -11,5 +11,6 @@ with open('simpleimputer-string.pkl', 'wb') as f: {% endif %} {% if cols_almost_missing_string %} STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} +{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') {% endif %} \ No newline at end of file From 000fc7a51c37da31cb26a7257e66d3836d9ebd25 Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Fri, 21 Jun 2024 14:15:38 +0900 Subject: [PATCH 2/3] fix:Add column renaming to duplicate column names after removing special characters --- sapientml_core/explain/main.py | 41 ++++++++++-- sapientml_core/generator.py | 37 ++++++++++- .../preprocess/default/generator.py | 62 +++++++++++++++++-- .../default/templates/rename_columns.py.jinja | 12 +++- 4 files changed, 137 insertions(+), 15 deletions(-) diff --git a/sapientml_core/explain/main.py b/sapientml_core/explain/main.py index 0b17761..e9f57c5 100644 --- a/sapientml_core/explain/main.py +++ b/sapientml_core/explain/main.py @@ -17,6 +17,7 @@ import pandas as pd from sapientml.params import CancellationToken from sapientml.util.logging import 
setup_logger +from sapientml_core.preprocess.default.generator import check_cols_has_symbols, remove_symbols, rename_cols from .AutoEDA import EDA from .AutoVisualization import AutoVisualization_Class @@ -81,12 +82,40 @@ def process( if visualization: # Call AutoVisualization to generate visualization codes AV = AutoVisualization_Class() - visualization_code = AV.AutoVisualization( - df=dataframe, - target_columns=target_columns, - problem_type=problem_type, - ignore_columns=ignore_columns, - ) + cols_has_symbols = check_cols_has_symbols(dataframe.columns.to_list()) + no_symbol_columns = [col for col in dataframe.columns.values if col not in cols_has_symbols] + if cols_has_symbols: + rename_dict = {} + org_df_column = dataframe.columns.to_list() + df_columns = list( + dataframe.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col).columns + ) + rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns) + if len(rename_dict) != 0: + col_has_target = [] + for org_column, target in zip(list(rename_dict.keys()), list(rename_dict.values())): + if target in target_columns: + col_has_target.append(org_column) + visualization_code = AV.AutoVisualization( + df=dataframe, + target_columns=col_has_target, + problem_type=problem_type, + ignore_columns=ignore_columns, + ) + else: + visualization_code = AV.AutoVisualization( + df=dataframe, + target_columns=col_has_target, + problem_type=problem_type, + ignore_columns=ignore_columns, + ) + else: + visualization_code = AV.AutoVisualization( + df=dataframe, + target_columns=target_columns, + problem_type=problem_type, + ignore_columns=ignore_columns, + ) else: visualization_code = None diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py index ad5e074..5ac2830 100644 --- a/sapientml_core/generator.py +++ b/sapientml_core/generator.py @@ -222,8 +222,43 @@ def generate_pipeline(self, dataset: Dataset, task: Task): for pipeline in sapientml_results: pipeline.validation = 
code_block.validation + pipeline.validation pipeline.test = code_block.test + pipeline.test - pipeline.train = code_block.train + pipeline.train pipeline.predict = code_block.predict + pipeline.predict + if "cols_has_symbols" in pipeline.test: + pipeline.test = pipeline.test.replace( + '"feature": feature_train.columns', + '"feature": feature_train.rename(columns=rename_symbol_cols).columns', + ) + pipeline.test = pipeline.test.replace( + "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" + ) + + pipeline.predict = pipeline.predict.replace( + '"feature": feature_train.columns', + '"feature": feature_train.rename(columns=rename_symbol_cols).columns', + ) + pipeline.predict = pipeline.predict.replace( + "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" + ) + + pipeline.validation = pipeline.validation.replace( + '"feature": feature_train.columns', + '"feature": feature_train.rename(columns=rename_symbol_cols).columns', + ) + pipeline.validation = pipeline.validation.replace( + "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv" + ) + + def replace_targets(match_obj): + return match_obj[0].replace( + "TARGET_COLUMNS", "[rename_symbol_cols.get(v, v) for v in TARGET_COLUMNS]" + ) + + pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)" + pipeline.test = re.sub(pat, replace_targets, pipeline.test) + pipeline.predict = re.sub(pat, replace_targets, pipeline.predict) + pipeline.validation = re.sub(pat, replace_targets, pipeline.validation) + + pipeline.train = code_block.train + pipeline.train result_pipelines.append(pipeline) logger.info("Executing generated pipelines...") diff --git a/sapientml_core/preprocess/default/generator.py b/sapientml_core/preprocess/default/generator.py index 465b984..ee6e41a 100644 --- a/sapientml_core/preprocess/default/generator.py +++ b/sapientml_core/preprocess/default/generator.py @@ -12,7 +12,9 @@ # See the License for 
# Fixed seed so the generated suffixes (and therefore the generated pipeline
# code that embeds the mapping) are reproducible from run to run.
seedvalue = 4736224


def rename_cols(org_column_name: list, no_symbol_columns: list, df_columns: list):
    """Build a rename mapping that makes symbol-stripped column names unique.

    Stripping inhibited symbols can make several columns collapse to the same
    name (``"a["`` and ``"a"`` both become ``"a"``). Every column whose
    stripped name is shared with another column gets a random 4-digit suffix
    appended; all other columns keep their stripped name.

    Parameters
    ----------
    org_column_name : list
        All original column names, in DataFrame order (with symbols).
    no_symbol_columns : list
        Original column names that contained no inhibited symbols. Kept for
        backward compatibility; duplicates are now detected across *all*
        columns, so this argument no longer restricts the check.
    df_columns : list
        Column names after symbol removal, parallel to ``org_column_name``
        (same length, same order).

    Returns
    -------
    rename_dict : dict
        Mapping ``original name -> unique new name`` covering every column,
        or an empty dict when ``df_columns`` contains no duplicates.

    """
    # A private generator yields the same suffix sequence as
    # ``random.seed(seedvalue)`` followed by ``random.randint`` calls, but
    # does not reset the process-wide random state as a side effect.
    rng = random.Random(seedvalue)

    current_names = list(df_columns)
    rename_dict = {}

    def _duplicated(names):
        # Names occurring more than once, regardless of whether the clash
        # involves a column that originally had no symbols.
        return {name for name, count in collections.Counter(names).items() if count > 1}

    clashing = _duplicated(current_names)
    # Repeat until unique: two columns may (rarely) draw the same suffix, or
    # a suffixed name may coincide with another existing column.
    while clashing:
        for target, org_column in zip(current_names, org_column_name):
            if target in clashing:
                rename_dict[org_column] = target + str(rng.randint(1000, 9999))
            else:
                rename_dict[org_column] = target

        current_names = [rename_dict[col] for col in org_column_name]
        clashing = _duplicated(current_names)

    return rename_dict
) + org_df_column = df.columns.values + org_target_columns = list(task.target_columns) + no_symbol_columns = [col for col in df.columns.values if col not in cols_has_symbols] df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col) + df_columns = df.columns.values task.target_columns = [ remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns ] + if df.columns.duplicated().any(): + rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns) + df = df.set_axis(list(rename_dict.values()), axis=1) + task.target_columns = [rename_dict[col] for col in org_target_columns] tpl = template_env.get_template("rename_columns.py.jinja") - code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) - code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols) - code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols) - code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols) + code.validation += _render( + tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) + code.test += _render( + tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) + code.train += _render( + tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) + code.predict += _render( + tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict + ) # If None is intentionally inserted in the data, an error occurs, so we have added an action to change None to "np.nan." 
if df.isin([None]).any(axis=None): diff --git a/sapientml_core/preprocess/default/templates/rename_columns.py.jinja b/sapientml_core/preprocess/default/templates/rename_columns.py.jinja index 7e21706..846e637 100644 --- a/sapientml_core/preprocess/default/templates/rename_columns.py.jinja +++ b/sapientml_core/preprocess/default/templates/rename_columns.py.jinja @@ -2,9 +2,15 @@ import re cols_has_symbols = {{ cols_has_symbols }} inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") +{% if rename_dict %} +rename_symbol_cols = {{ rename_dict }} +{% else %} +rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col in cols_has_symbols for col in cols_has_symbols } +{% endif %} {% if training %} -train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) +train_dataset = train_dataset.rename(columns=rename_symbol_cols) {% endif %} {% if test %} -test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) -{% endif %} \ No newline at end of file +test_dataset = test_dataset.rename(columns=rename_symbol_cols) +{% endif %} +rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()} \ No newline at end of file From b886944227ae5bd0ee0c29ac521458e44a8873e9 Mon Sep 17 00:00:00 2001 From: tashiro-akira Date: Fri, 28 Jun 2024 10:36:23 +0900 Subject: [PATCH 3/3] fix:Because there is a difference in the index of dataframe, Nan is not mixed in the data --- sapientml_core/templates/model_templates/model.py.jinja | 2 +- sapientml_core/templates/model_templates/model_predict.py.jinja | 2 +- sapientml_core/templates/model_templates/model_test.py.jinja | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sapientml_core/templates/model_templates/model.py.jinja b/sapientml_core/templates/model_templates/model.py.jinja index 747b6ee..f00ef49 100644 --- 
a/sapientml_core/templates/model_templates/model.py.jinja +++ b/sapientml_core/templates/model_templates/model.py.jinja @@ -51,7 +51,7 @@ y_pred = model.predict(feature_test) y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1) {% endif %} {% if is_multioutput_classification %} -y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) +y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) for column in TARGET_COLUMNS: y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int)) y_pred = y_pred_df diff --git a/sapientml_core/templates/model_templates/model_predict.py.jinja b/sapientml_core/templates/model_templates/model_predict.py.jinja index 003f1a5..47f4f0d 100644 --- a/sapientml_core/templates/model_templates/model_predict.py.jinja +++ b/sapientml_core/templates/model_templates/model_predict.py.jinja @@ -16,7 +16,7 @@ with open('target_LabelEncoder.pkl', 'rb') as f: label_encoder = pickle.load(f) {% endif %} {% if is_multioutput_classification %} -y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) +y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) for column in TARGET_COLUMNS: y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int)) y_pred = y_pred_df diff --git a/sapientml_core/templates/model_templates/model_test.py.jinja b/sapientml_core/templates/model_templates/model_test.py.jinja index 7a47d18..259d7fe 100644 --- a/sapientml_core/templates/model_templates/model_test.py.jinja +++ b/sapientml_core/templates/model_templates/model_test.py.jinja @@ -49,7 +49,7 @@ model.fit(feature_train, target_train) y_pred = model.predict(feature_test) {% if is_multioutput_classification %} -y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) +y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) for column in TARGET_COLUMNS: y_pred_df[column] = 
label_encoders[column].inverse_transform(y_pred_df[column].astype(int)) y_pred = y_pred_df