Commit 17a317e
Add files via upload
1 parent eeae6a3

File tree: 4 files changed, +93 -55 lines changed

cal_column_similarity.py

Lines changed: 29 additions & 14 deletions
@@ -1,13 +1,16 @@
 import init
 from relation_features import make_data_from
-from utils import make_csv_from_json
+from utils import make_csv_from_json,table_column_filter
 from train import test
 import numpy as np
 import pandas as pd
 import xgboost as xgb
 import os
 import argparse
 import time
+from pathlib import Path
+
+this_directory = Path(__file__).parent
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-p","--path", help="path to the folder containing the test data")
@@ -16,7 +19,7 @@
 parser.add_argument("-s", "--strategy", help="one-to-one or many-to-many or one-to-many", default="many-to-many")
 args = parser.parse_args()
 
-def create_similarity_matrix(pth,preds,pred_labels_list,strategy="many-to-many"):
+def create_similarity_matrix(table1_df,table2_df,preds,pred_labels_list,strategy="many-to-many"):
     """
     Create a similarity matrix from the prediction
     """
@@ -27,10 +30,8 @@ def create_similarity_matrix(pth,preds,pred_labels_list,strategy="many-to-many")
     pred_labels = np.mean(pred_labels_list,axis=0)
     pred_labels = np.where(pred_labels>0.5,1,0)
     # read column names
-    df1 = pd.read_csv(pth+"/Table1.csv")
-    df2 = pd.read_csv(pth+"/Table2.csv")
-    df1_cols = df1.columns
-    df2_cols = df2.columns
+    df1_cols = table1_df.columns
+    df2_cols = table2_df.columns
     # create similarity matrix for pred values
     preds_matrix = np.array(preds).reshape(len(df1_cols),len(df2_cols))
     # create similarity matrix for pred labels
@@ -58,16 +59,30 @@ def create_similarity_matrix(pth,preds,pred_labels_list,strategy="many-to-many")
             predicted_pairs.append((df_pred.index[i],df_pred.columns[j],df_pred.iloc[i,j]))
     return df_pred,df_pred_labels,predicted_pairs
 
-def schema_matching(pth,model_pth,threshold=None,strategy="many-to-many"):
+def schema_matching(table1_pth,table2_pth,threshold=None,strategy="many-to-many",model_pth=None):
     """
     Do schema matching!
     """
+    if model_pth is None:
+        model_pth = str(this_directory / "model" / "2022-04-12-12-06-32")
     # transform jsonl or json file to csv
-    for file in os.listdir(pth):
-        if file.endswith('.json') or file.endswith('.jsonl'):
-            make_csv_from_json(pth+"/"+file)
+    if table1_pth.endswith('.json') or table1_pth.endswith('.jsonl'):
+        table1_df = make_csv_from_json(table1_pth)
+    else:
+        table1_df = pd.read_csv(table1_pth)
+    if table2_pth.endswith('.json') or table2_pth.endswith('.jsonl'):
+        table2_df = make_csv_from_json(table2_pth)
+    else:
+        table2_df = pd.read_csv(table2_pth)
+
+    # filter columns
+    table1_df = table_column_filter(table1_df)
+    table2_df = table_column_filter(table2_df)
+
+    # extract features
+    features,_ = make_data_from(table1_df, table2_df, type="test")
 
-    features,_ = make_data_from(pth,"test")
+    # load model and predict on features
     preds = []
     pred_labels_list = []
     for i in range(len(os.listdir(model_pth))//2):
@@ -78,18 +93,18 @@ def schema_matching(pth,model_pth,threshold=None,strategy="many-to-many"):
         else:
             with open(model_pth+"/"+str(i)+".threshold",'r') as f:
                 best_threshold = float(f.read())
-        pred,pred_labels = test(bst,best_threshold,features,test_labels=np.ones(len(features)),type="inference")
+        pred, pred_labels = test(bst, best_threshold, features, test_labels=np.ones(len(features)), type="inference")
         preds.append(pred)
         pred_labels_list.append(pred_labels)
         del bst
 
-    df_pred,df_pred_labels,predicted_pairs = create_similarity_matrix(pth,preds,pred_labels_list,strategy=strategy)
+    df_pred,df_pred_labels,predicted_pairs = create_similarity_matrix(table1_df, table2_df, preds, pred_labels_list, strategy=strategy)
     return df_pred,df_pred_labels,predicted_pairs
 
 if __name__ == '__main__':
     start = time.time()
     args.path = args.path.rstrip("/")
-    df_pred,df_pred_labels,predicted_pairs = schema_matching(args.path,args.model,threshold=args.threshold,strategy=args.strategy)
+    df_pred,df_pred_labels,predicted_pairs = schema_matching(args.path+"/Table1.csv",args.path+"/Table2.csv",threshold=args.threshold,strategy=args.strategy,model_pth=args.model)
     df_pred.to_csv(args.path+"/similarity_matrix_value.csv",index=True)
     df_pred_labels.to_csv(args.path+"/similarity_matrix_label.csv",index=True)
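Since schema_matching now takes two table paths and makes the model path optional, it can be driven directly from Python as well as from the CLI. A minimal usage sketch, assuming hypothetical input files data/Table1.csv and data/Table2.json and the bundled default model:

from cal_column_similarity import schema_matching

# data/Table1.csv and data/Table2.json are hypothetical paths; JSON/JSONL
# inputs are converted via make_csv_from_json, CSV inputs are read directly.
df_pred, df_pred_labels, predicted_pairs = schema_matching(
    "data/Table1.csv",
    "data/Table2.json",
    strategy="one-to-one",   # default is "many-to-many"
)                            # model_pth=None resolves to model/2022-04-12-12-06-32
for col1, col2, score in predicted_pairs:
    print(col1, col2, score)

One caveat: the module still runs parser.parse_args() at import time, so importing it works cleanly only when the host process's argv carries no flags the parser rejects.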

relation_features.py

Lines changed: 26 additions & 26 deletions
@@ -12,6 +12,7 @@
 from nltk.translate.bleu_score import SmoothingFunction
 from sentence_transformers import util
 import re
+from utils import table_column_filter
 
 model = init.model
 
@@ -40,7 +41,7 @@ def read_mapping(mapping_file):
     """
     Read mapping file and return a set.
     """
-    if not os.path.exists(mapping_file):
+    if mapping_file is None or not os.path.exists(mapping_file):
         return set()
     with open(mapping_file, 'r') as f:
         readed = f.readlines()
@@ -57,20 +58,20 @@ def make_combinations_labels(columns1, columns2, mapping ,type="train"):
     Make combinations from columns1 list and columns2 list. Label them using mapping.
     """
     labels = {}
-    for c1 in columns1:
-        for c2 in columns2:
+    for i,c1 in enumerate(columns1):
+        for j,c2 in enumerate(columns2):
             if (c1, c2) in mapping or (c2, c1) in mapping:
-                labels[(c1, c2)] = 1
+                labels[(i, j)] = 1
             else:
-                labels[(c1, c2)] = 0
+                labels[(i, j)] = 0
     # sample negative labels
     if type == "train":
         combinations_count = len(labels)
         for i in range(combinations_count*2):
             if sum(labels.values()) >= 0.1 * len(labels):
                 break
-            c1 = random.choice(columns1)
-            c2 = random.choice(columns2)
+            c1 = random.choice(range(len(columns1)))
+            c2 = random.choice(range(len(columns2)))
             if (c1, c2) in labels and labels[c1, c2] == 0:
                 del labels[(c1, c2)]
     return labels
@@ -93,40 +94,34 @@ def get_instance_similarity(embeddings1, embeddings2):
     """
     cosine_similarity = np.inner(embeddings1, embeddings2) / (norm(embeddings1) * norm(embeddings2))
     return np.array([cosine_similarity])
-
-def make_data_from(folder_path,type="train"):
+
+def make_data_from(table1_df, table2_df,mapping_file=None,type="train"):
     """
-    Read data from folder and make relational features and labels as a matrix.
+    Read data from two table DataFrames and a mapping file path, and make relational features and labels as a matrix.
     """
-    mapping_file = folder_path + "/" + "mapping.txt"
-    table1 = folder_path + "/" + "Table1.csv"
-    table2 = folder_path + "/" + "Table2.csv"
-
     mapping = read_mapping(mapping_file)
-    table1_df = pd.read_csv(table1)
-    table2_df = pd.read_csv(table2)
-    columns1 = [c for c in list(table1_df.columns) if not "Unnamed:" in c]
-    columns2 = [c for c in list(table2_df.columns) if not "Unnamed:" in c]
+    columns1 = list(table1_df.columns)
+    columns2 = list(table2_df.columns)
 
     combinations_labels = make_combinations_labels(columns1, columns2, mapping,type)
-    table1_features = make_self_features_from(table1)
-    table2_features = make_self_features_from(table2)
+    table1_features = make_self_features_from(table1_df)
+    table2_features = make_self_features_from(table2_df)
 
     column_name_embeddings = {preprocess_text(k):model.encode(preprocess_text(k)) for k in columns1+columns2}
 
     additional_feature_num = 6
     output_feature_table = np.zeros((len(combinations_labels), table1_features.shape[1] - 768 + additional_feature_num), dtype=np.float32)
     output_labels = np.zeros(len(combinations_labels), dtype=np.int32)
     for i, (combination,label) in enumerate(combinations_labels.items()):
-        c1_name, c2_name = combination
-        c1 = columns1.index(c1_name)
-        c2 = columns2.index(c2_name)
+        c1,c2 = combination
+        c1_name = columns1[c1]
+        c2_name = columns2[c2]
         difference_features_percent = np.abs(table1_features[c1] - table2_features[c2]) / (table1_features[c1] + table2_features[c2] + 1e-8)
         c1_name = preprocess_text(c1_name)
         c2_name = preprocess_text(c2_name)
         colnames_features = get_colnames_features(c1_name, c2_name,column_name_embeddings)
         instance_similarity = get_instance_similarity(table1_features[c1][-768:], table2_features[c2][-768:])
-        output_feature_table[i,:] = np.concatenate((difference_features_percent[:-768], colnames_features,instance_similarity))
+        output_feature_table[i,:] = np.concatenate((difference_features_percent[:-768], colnames_features, instance_similarity))
         output_labels[i] = label
         # add column names mask for training data
         if type == "train" and i % 5 == 0:
@@ -153,10 +148,15 @@ def make_data_from(folder_path,type="train"):
     for folder in folder_list:
         print("start extracting data from " + folder)
         data_folder = "Training Data/" + folder
-        features,labels = make_data_from(data_folder,"train")
+        table1_df = pd.read_csv(data_folder + "/Table1.csv")
+        table2_df = pd.read_csv(data_folder + "/Table2.csv")
+        table1_df = table_column_filter(table1_df)
+        table2_df = table_column_filter(table2_df)
+        mapping_file = data_folder + "/mapping.txt"
+        features,labels = make_data_from(table1_df, table2_df, mapping_file,type="train")
         train_features[folder] = features
         train_labels[folder] = labels
-        features,labels = make_data_from(data_folder,"test")
+        features,labels = make_data_from(table1_df, table2_df, mapping_file,type="test")
         test_features[folder] = features
         test_labels[folder] = labels
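make_data_from now consumes two DataFrames and an optional mapping file instead of a folder path, which separates feature extraction from the on-disk layout. A small sketch of both call sites, with hypothetical file names:

import pandas as pd
from utils import table_column_filter
from relation_features import make_data_from

# Hypothetical paths. Filtering first keeps the feature rows aligned with
# the columns that survive table_column_filter.
table1_df = table_column_filter(pd.read_csv("Table1.csv"))
table2_df = table_column_filter(pd.read_csv("Table2.csv"))

# Inference: with mapping_file=None, read_mapping returns an empty set, so
# every pair is labeled 0 and the negative-sampling branch is skipped.
features, _ = make_data_from(table1_df, table2_df, type="test")

# Training: pass the ground-truth mapping file explicitly.
features, labels = make_data_from(table1_df, table2_df, "mapping.txt", type="train")

Keying combinations by column index rather than by name also sidesteps ambiguity when the two tables share or repeat column names.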

self_features.py

Lines changed: 7 additions & 10 deletions
@@ -38,6 +38,7 @@ def mainly_numeric(data_list):
     """
     cnt = 0
     for data in data_list:
+        data = str(data)
         data = data.replace(",", "")
         for unit in unit_dict.keys():
             data = data.replace(unit, "")
@@ -171,8 +172,6 @@ def extract_features(data_list):
     Extract some features from the given data(column) or list
     """
     data_list = [d for d in data_list if d == d and d != "--"]
-    if len(data_list) == 0:
-        return 0
     data_types = ("url","numeric","date","string")
     # Classify the data's type, URL or Date or Numeric
     if is_url(data_list):
@@ -204,18 +203,15 @@ def extract_features(data_list):
     output_features = np.concatenate((data_type_feature, num_fts, length_fts, char_fts, deep_fts))
     return output_features
 
-def make_self_features_from(filepath):
+def make_self_features_from(table_df):
     """
     Extracts features from the given table path and returns a feature table.
     """
-    df = load_table(filepath)
     features = None
-    for column in df.columns:
+    for column in table_df.columns:
         if "Unnamed:" in column:
             continue
-        fts = extract_features(df[column])
-        if type(fts) == int:
-            continue
+        fts = extract_features(table_df[column])
         fts = fts.reshape(1, -1)
         if features is None:
             features = fts
@@ -224,5 +220,6 @@ def make_self_features_from(filepath):
     return features
 
 if __name__ == '__main__':
-    features = make_self_features_from("Training Data/pair_7/Table1.csv")
-    print(features)
+    features = make_self_features_from(load_table("Test Data/0archive/Table2.csv"))
+    print(features)
+    print(features.shape)
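make_self_features_from now expects a DataFrame rather than a file path, and mainly_numeric coerces each value to str before stripping separators, so numeric dtypes no longer break the replace calls. A quick sketch, assuming a hypothetical CSV:

import pandas as pd
from self_features import make_self_features_from

table_df = pd.read_csv("Table1.csv")          # hypothetical path
features = make_self_features_from(table_df)  # one feature row per kept column
print(features.shape)

Note that features stays None for a table with no usable columns, which is why the pipeline runs table_column_filter before extracting features.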

utils.py

Lines changed: 31 additions & 5 deletions
@@ -29,20 +29,46 @@ def make_csv_from_json(file_path):
     """
     Make csv file from json file.
     """
-    with open(file_path, 'r', encoding='utf-8') as f:
-        data = json.load(f)
+    if file_path.endswith(".json"):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    elif file_path.endswith(".jsonl"):
+        data = []
+        with open(file_path, 'r') as json_file:
+            json_list = list(json_file)
+        for json_str in json_list:
+            data.append(json.loads(json_str))
 
     # find key_values
     if isinstance(data, dict):
         key_values = find_all_keys_values(data,"")
     elif isinstance(data, list):
-        key_values = find_all_keys_values({"data":data},"")
+        key_values = find_all_keys_values({"TOPLEVEL":data},"TOPLEVEL")
     else:
         raise ValueError('Your input JsonData is not a dictionary or list')
 
-    key_values = {k:v for k,v in key_values.items() if len(v)>1}
+    key_values = {k.replace("TOPLEVEL.",""):v for k,v in key_values.items() if len(v)>1}
 
     df = pd.DataFrame({k:pd.Series(v) for k,v in key_values.items()})
     # save to csv
     save_pth = re.sub(r'\.jsonl?','.csv',file_path)
-    df.to_csv(save_pth, index=False, encoding='utf-8')
+    df.to_csv(save_pth, index=False, encoding='utf-8')
+    return df
+
+def table_column_filter(table_df):
+    """
+    Filter out columns that have no instances or whose instances are all "--".
+    """
+    original_columns = table_df.columns
+    for column in table_df.columns:
+        column_data = [d for d in list(table_df[column]) if d == d and d != "--"]
+        if len(column_data) <= 1:
+            table_df = table_df.drop(column, axis=1)
+            continue
+        if "Unnamed:" in column:
+            table_df = table_df.drop(column, axis=1)
+            continue
+    remove_columns = list(set(original_columns) - set(table_df.columns))
+    if len(remove_columns) > 0:
+        print("Removed columns:", remove_columns)
+    return table_df
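make_csv_from_json now returns the DataFrame it builds (still writing the .csv next to the input) and understands JSON Lines, while table_column_filter centralizes the column pruning that the feature code used to do inline. A short sketch with a hypothetical .jsonl input:

from utils import make_csv_from_json, table_column_filter

# Hypothetical path; data/records.csv is also written as a side effect.
df = make_csv_from_json("data/records.jsonl")

# Drops "Unnamed:" columns and any column with at most one usable
# (non-NaN, non "--") value, printing what was removed.
df = table_column_filter(df)

The TOPLEVEL sentinel exists only to give a top-level JSON array a key for find_all_keys_values; it is stripped from the column names before the DataFrame is built.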
