From 5121800c66c20b3b2a4c2620773d00ae551845bf Mon Sep 17 00:00:00 2001
From: tashiro akira <fj1755jk@fujitsu.com>
Date: Tue, 24 Oct 2023 10:40:45 +0900
Subject: [PATCH 1/3] Fixed an error when there were many missing bool columns
 in the input data

Signed-off-by: tashiro akira <fj1755jk@fujitsu.com>
---
 .../preprocessing_templates/fillna-type-string.py.jinja         | 2 ++
 .../preprocessing_templates/fillna-type-string_predict.py.jinja | 1 +
 .../preprocessing_templates/fillna-type-string_train.py.jinja   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
index 5a8da92..5d0787b 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja
@@ -9,6 +9,8 @@ simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
+{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
 {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
index ef58ba7..472ec11 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja
@@ -7,5 +7,6 @@ STRING_COLS_WITH_MISSING_VALUES = {{ columns }}
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
 {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file
diff --git a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
index 404804b..81d5621 100644
--- a/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
+++ b/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja
@@ -11,5 +11,6 @@ with open('simpleimputer-string.pkl', 'wb') as f:
 {% endif %}
 {% if cols_almost_missing_string %}
 STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
+{{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
 {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
 {% endif %}
\ No newline at end of file

From 000fc7a51c37da31cb26a7257e66d3836d9ebd25 Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Fri, 21 Jun 2024 14:15:38 +0900
Subject: [PATCH 2/3] fix: Rename column names that become duplicates after
 removing special characters

---
 sapientml_core/explain/main.py                | 41 ++++++++++--
 sapientml_core/generator.py                   | 37 ++++++++++-
 .../preprocess/default/generator.py           | 62 +++++++++++++++++--
 .../default/templates/rename_columns.py.jinja | 12 +++-
 4 files changed, 137 insertions(+), 15 deletions(-)

diff --git a/sapientml_core/explain/main.py b/sapientml_core/explain/main.py
index 0b17761..e9f57c5 100644
--- a/sapientml_core/explain/main.py
+++ b/sapientml_core/explain/main.py
@@ -17,6 +17,7 @@
 import pandas as pd
 from sapientml.params import CancellationToken
 from sapientml.util.logging import setup_logger
+from sapientml_core.preprocess.default.generator import check_cols_has_symbols, remove_symbols, rename_cols
 
 from .AutoEDA import EDA
 from .AutoVisualization import AutoVisualization_Class
@@ -81,12 +82,40 @@ def process(
     if visualization:
         # Call AutoVisualization to generate visualization codes
         AV = AutoVisualization_Class()
-        visualization_code = AV.AutoVisualization(
-            df=dataframe,
-            target_columns=target_columns,
-            problem_type=problem_type,
-            ignore_columns=ignore_columns,
-        )
+        cols_has_symbols = check_cols_has_symbols(dataframe.columns.to_list())
+        no_symbol_columns = [col for col in dataframe.columns.values if col not in cols_has_symbols]
+        if cols_has_symbols:
+            org_df_column = dataframe.columns.to_list()
+            df_columns = list(
+                dataframe.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col).columns
+            )
+            # rename_cols returns an empty dict when removing the symbols creates no duplicate names.
+            rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns)
+            if rename_dict:
+                # ``dataframe`` is not renamed here, so collect the original names of the
+                # columns whose renamed name is listed in ``target_columns``.
+                col_has_target = [org for org, new in rename_dict.items() if new in target_columns]
+                visualization_code = AV.AutoVisualization(
+                    df=dataframe,
+                    target_columns=col_has_target,
+                    problem_type=problem_type,
+                    ignore_columns=ignore_columns,
+                )
+            else:
+                # Nothing was renamed to resolve duplicates: pass the given targets through unchanged.
+                visualization_code = AV.AutoVisualization(
+                    df=dataframe,
+                    target_columns=target_columns,
+                    problem_type=problem_type,
+                    ignore_columns=ignore_columns,
+                )
+        else:
+            visualization_code = AV.AutoVisualization(
+                df=dataframe,
+                target_columns=target_columns,
+                problem_type=problem_type,
+                ignore_columns=ignore_columns,
+            )
     else:
         visualization_code = None
 
diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py
index ad5e074..5ac2830 100644
--- a/sapientml_core/generator.py
+++ b/sapientml_core/generator.py
@@ -222,8 +222,43 @@ def generate_pipeline(self, dataset: Dataset, task: Task):
         for pipeline in sapientml_results:
             pipeline.validation = code_block.validation + pipeline.validation
             pipeline.test = code_block.test + pipeline.test
-            pipeline.train = code_block.train + pipeline.train
             pipeline.predict = code_block.predict + pipeline.predict
+            if "cols_has_symbols" in pipeline.test:
+                pipeline.test = pipeline.test.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.test = pipeline.test.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                pipeline.predict = pipeline.predict.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.predict = pipeline.predict.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                pipeline.validation = pipeline.validation.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.validation = pipeline.validation.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                def replace_targets(match_obj):
+                    return match_obj[0].replace(
+                        "TARGET_COLUMNS", "[rename_symbol_cols.get(v, v) for v in TARGET_COLUMNS]"
+                    )
+
+                pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)"
+                pipeline.test = re.sub(pat, replace_targets, pipeline.test)
+                pipeline.predict = re.sub(pat, replace_targets, pipeline.predict)
+                pipeline.validation = re.sub(pat, replace_targets, pipeline.validation)
+
+            pipeline.train = code_block.train + pipeline.train
             result_pipelines.append(pipeline)
 
         logger.info("Executing generated pipelines...")
diff --git a/sapientml_core/preprocess/default/generator.py b/sapientml_core/preprocess/default/generator.py
index 465b984..ee6e41a 100644
--- a/sapientml_core/preprocess/default/generator.py
+++ b/sapientml_core/preprocess/default/generator.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import collections
 import os
+import random
 import re
 from pathlib import Path
 from typing import Tuple
@@ -33,7 +35,7 @@
 logger = setup_logger()
 
 INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+")
-
+seedvalue = 4736224
 
 template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True)
 
@@ -195,6 +197,40 @@ def remove_symbols(column_name: str) -> str:
     return INHIBITED_SYMBOL_PATTERN.sub("", column_name)
 
 
+def rename_cols(org_column_name: list, no_symbol_columns: list, df_columns: list):
+    """Build a mapping that gives every column a unique name after symbol removal.
+
+    Parameters
+    ----------
+    org_column_name : list
+        Original column names, in the same order as ``df_columns``
+    no_symbol_columns : list
+        Original column names without special characters (unused, kept for compatibility)
+    df_columns : list
+        Column names after the special characters were removed
+
+    Returns
+    -------
+    rename_dict : dict
+        Original name -> unique new name for every column; empty if there are no duplicates.
+
+    """
+    # A private generator produces the same suffixes as seeding the module-level one,
+    # but leaves the random state used by the rest of the program untouched.
+    rng = random.Random(seedvalue)
+    rename_dict = {}
+    # Every duplicated name is resolved, including clashes between two symbol-stripped names.
+    same_column = {name for name, count in collections.Counter(df_columns).items() if count > 1}
+    while same_column:
+        for target, org_column in zip(df_columns, org_column_name):
+            # Duplicated names get a random four-digit suffix; all others are kept as they are.
+            rename_dict[org_column] = target + str(rng.randint(1000, 9999)) if target in same_column else target
+        df_columns = [rename_dict[col] for col in org_column_name]
+        same_column = {name for name, count in collections.Counter(df_columns).items() if count > 1}
+
+    return rename_dict
+
+
 class DefaultPreprocess(CodeBlockGenerator):
     def __init__(self, **kwargs):
         self.config = DefaultPreprocessConfig(**kwargs)
@@ -230,15 +266,31 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             logger.warning(
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
             )
+            org_df_column = df.columns.values
+            org_target_columns = list(task.target_columns)
+            no_symbol_columns = [col for col in df.columns.values if col not in cols_has_symbols]
             df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col)
+            rename_dict = {}  # stays empty unless removing the symbols produced duplicate column names
             task.target_columns = [
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
+            if df.columns.duplicated().any():
+                rename_dict = rename_cols(org_df_column, no_symbol_columns, df.columns.values)
+                df = df.set_axis(list(rename_dict.values()), axis=1)
+                task.target_columns = [rename_dict[col] for col in org_target_columns]
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols)
-            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols)
+            code.validation += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.test += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.train += _render(
+                tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.predict += _render(
+                tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
 
         # If None is intentionally inserted in the data, an error occurs, so we have added an action to change None to "np.nan."
         if df.isin([None]).any(axis=None):
diff --git a/sapientml_core/preprocess/default/templates/rename_columns.py.jinja b/sapientml_core/preprocess/default/templates/rename_columns.py.jinja
index 7e21706..846e637 100644
--- a/sapientml_core/preprocess/default/templates/rename_columns.py.jinja
+++ b/sapientml_core/preprocess/default/templates/rename_columns.py.jinja
@@ -2,9 +2,15 @@
 import re
 cols_has_symbols = {{ cols_has_symbols }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
+{% if rename_dict %}
+rename_symbol_cols = {{ rename_dict }}
+{% else %}
+rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) for col in cols_has_symbols}
+{% endif %}
 {% if training %}
-train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
+train_dataset = train_dataset.rename(columns=rename_symbol_cols)
 {% endif %}
 {% if test %}
-test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
-{% endif %}
\ No newline at end of file
+test_dataset = test_dataset.rename(columns=rename_symbol_cols)
+{% endif %}
+rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()}
\ No newline at end of file

From b886944227ae5bd0ee0c29ac521458e44a8873e9 Mon Sep 17 00:00:00 2001
From: tashiro-akira <fj0822cr@fujitsu.com>
Date: Fri, 28 Jun 2024 10:36:23 +0900
Subject: [PATCH 3/3] fix: Use the index of feature_test for y_pred_df so that
 NaN is not mixed into the data

---
 sapientml_core/templates/model_templates/model.py.jinja         | 2 +-
 sapientml_core/templates/model_templates/model_predict.py.jinja | 2 +-
 sapientml_core/templates/model_templates/model_test.py.jinja    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sapientml_core/templates/model_templates/model.py.jinja b/sapientml_core/templates/model_templates/model.py.jinja
index 747b6ee..f00ef49 100644
--- a/sapientml_core/templates/model_templates/model.py.jinja
+++ b/sapientml_core/templates/model_templates/model.py.jinja
@@ -51,7 +51,7 @@ y_pred = model.predict(feature_test)
 y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1)
 {% endif %}
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df
diff --git a/sapientml_core/templates/model_templates/model_predict.py.jinja b/sapientml_core/templates/model_templates/model_predict.py.jinja
index 003f1a5..47f4f0d 100644
--- a/sapientml_core/templates/model_templates/model_predict.py.jinja
+++ b/sapientml_core/templates/model_templates/model_predict.py.jinja
@@ -16,7 +16,7 @@ with open('target_LabelEncoder.pkl', 'rb') as f:
     label_encoder = pickle.load(f)
 {% endif %}
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df
diff --git a/sapientml_core/templates/model_templates/model_test.py.jinja b/sapientml_core/templates/model_templates/model_test.py.jinja
index 7a47d18..259d7fe 100644
--- a/sapientml_core/templates/model_templates/model_test.py.jinja
+++ b/sapientml_core/templates/model_templates/model_test.py.jinja
@@ -49,7 +49,7 @@ model.fit(feature_train, target_train)
 y_pred = model.predict(feature_test)
 
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df