1212from nltk .translate .bleu_score import SmoothingFunction
1313from sentence_transformers import util
1414import re
15+ from utils import table_column_filter
1516
1617model = init .model
1718
@@ -40,7 +41,7 @@ def read_mapping(mapping_file):
4041 """
4142 Read mapping file and return a set.
4243 """
43- if not os .path .exists (mapping_file ):
44+ if mapping_file is None or not os .path .exists (mapping_file ):
4445 return set ()
4546 with open (mapping_file , 'r' ) as f :
4647 readed = f .readlines ()
@@ -57,20 +58,20 @@ def make_combinations_labels(columns1, columns2, mapping ,type="train"):
5758 Make combinations from columns1 list and columns2 list. Label them using mapping.
5859 """
5960 labels = {}
60- for c1 in columns1 :
61- for c2 in columns2 :
61+ for i , c1 in enumerate ( columns1 ) :
62+ for j , c2 in enumerate ( columns2 ) :
6263 if (c1 , c2 ) in mapping or (c2 , c1 ) in mapping :
63- labels [(c1 , c2 )] = 1
64+ labels [(i , j )] = 1
6465 else :
65- labels [(c1 , c2 )] = 0
66+ labels [(i , j )] = 0
6667 # sample negative labels
6768 if type == "train" :
6869 combinations_count = len (labels )
6970 for i in range (combinations_count * 2 ):
7071 if sum (labels .values ()) >= 0.1 * len (labels ):
7172 break
72- c1 = random .choice (columns1 )
73- c2 = random .choice (columns2 )
73+ c1 = random .choice (range ( len ( columns1 )) )
74+ c2 = random .choice (range ( len ( columns2 )) )
7475 if (c1 , c2 ) in labels and labels [c1 , c2 ] == 0 :
7576 del labels [(c1 , c2 )]
7677 return labels
@@ -93,40 +94,34 @@ def get_instance_similarity(embeddings1, embeddings2):
9394 """
9495 cosine_similarity = np .inner (embeddings1 , embeddings2 ) / (norm (embeddings1 ) * norm (embeddings2 ))
9596 return np .array ([cosine_similarity ])
96-
97- def make_data_from (folder_path ,type = "train" ):
97+
98+ def make_data_from (table1_df , table2_df , mapping_file = None ,type = "train" ):
9899 """
99- Read data from folder and make relational features and labels as a matrix.
100+ Take two table DataFrames and an optional mapping file path, and make relational features and labels as matrices.
100101 """
101- mapping_file = folder_path + "/" + "mapping.txt"
102- table1 = folder_path + "/" + "Table1.csv"
103- table2 = folder_path + "/" + "Table2.csv"
104-
105102 mapping = read_mapping (mapping_file )
106- table1_df = pd .read_csv (table1 )
107- table2_df = pd .read_csv (table2 )
108- columns1 = [c for c in list (table1_df .columns ) if not "Unnamed:" in c ]
109- columns2 = [c for c in list (table2_df .columns ) if not "Unnamed:" in c ]
103+ columns1 = list (table1_df .columns )
104+ columns2 = list (table2_df .columns )
110105
111106 combinations_labels = make_combinations_labels (columns1 , columns2 , mapping ,type )
112- table1_features = make_self_features_from (table1 )
113- table2_features = make_self_features_from (table2 )
107+ table1_features = make_self_features_from (table1_df )
108+ table2_features = make_self_features_from (table2_df )
114109
115110 column_name_embeddings = {preprocess_text (k ):model .encode (preprocess_text (k )) for k in columns1 + columns2 }
116111
117112 additional_feature_num = 6
118113 output_feature_table = np .zeros ((len (combinations_labels ), table1_features .shape [1 ] - 768 + additional_feature_num ), dtype = np .float32 )
119114 output_labels = np .zeros (len (combinations_labels ), dtype = np .int32 )
120115 for i , (combination ,label ) in enumerate (combinations_labels .items ()):
121- c1_name , c2_name = combination
122- c1 = columns1 . index ( c1_name )
123- c2 = columns2 . index ( c2_name )
116+ c1 , c2 = combination
117+ c1_name = columns1 [ c1 ]
118+ c2_name = columns2 [ c2 ]
124119 difference_features_percent = np .abs (table1_features [c1 ] - table2_features [c2 ]) / (table1_features [c1 ] + table2_features [c2 ] + 1e-8 )
125120 c1_name = preprocess_text (c1_name )
126121 c2_name = preprocess_text (c2_name )
127122 colnames_features = get_colnames_features (c1_name , c2_name ,column_name_embeddings )
128123 instance_similarity = get_instance_similarity (table1_features [c1 ][- 768 :], table2_features [c2 ][- 768 :])
129- output_feature_table [i ,:] = np .concatenate ((difference_features_percent [:- 768 ], colnames_features ,instance_similarity ))
124+ output_feature_table [i ,:] = np .concatenate ((difference_features_percent [:- 768 ], colnames_features , instance_similarity ))
130125 output_labels [i ] = label
131126 # add column names mask for training data
132127 if type == "train" and i % 5 == 0 :
@@ -153,10 +148,15 @@ def make_data_from(folder_path,type="train"):
153148 for folder in folder_list :
154149 print ("start extracting data from " + folder )
155150 data_folder = "Training Data/" + folder
156- features ,labels = make_data_from (data_folder ,"train" )
151+ table1_df = pd .read_csv (data_folder + "/Table1.csv" )
152+ table2_df = pd .read_csv (data_folder + "/Table2.csv" )
153+ table1_df = table_column_filter (table1_df )
154+ table2_df = table_column_filter (table2_df )
155+ mapping_file = data_folder + "/mapping.txt"
156+ features ,labels = make_data_from (table1_df , table2_df , mapping_file ,type = "train" )
157157 train_features [folder ] = features
158158 train_labels [folder ] = labels
159- features ,labels = make_data_from (data_folder , "test" )
159+ features ,labels = make_data_from (table1_df , table2_df , mapping_file , type = "test" )
160160 test_features [folder ] = features
161161 test_labels [folder ] = labels
162162
0 commit comments