Commit 17a317e
Add files via upload
1 parent eeae6a3

File tree: 4 files changed, +93 -55 lines changed

cal_column_similarity.py

Lines changed: 29 additions & 14 deletions
@@ -1,13 +1,16 @@
 import init
 from relation_features import make_data_from
-from utils import make_csv_from_json
+from utils import make_csv_from_json,table_column_filter
 from train import test
 import numpy as np
 import pandas as pd
 import xgboost as xgb
 import os
 import argparse
 import time
+from pathlib import Path
+
+this_directory = Path(__file__).parent
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-p","--path", help="path to the folder containing the test data")
@@ -16,7 +19,7 @@
 parser.add_argument("-s", "--strategy", help="one-to-one or many-to-many or one-to-many", default="many-to-many")
 args = parser.parse_args()
 
-def create_similarity_matrix(pth,preds,pred_labels_list,strategy="many-to-many"):
+def create_similarity_matrix(table1_df,table2_df,preds,pred_labels_list,strategy="many-to-many"):
     """
     Create a similarity matrix from the prediction
     """
@@ -27,10 +30,8 @@ def create_similarity_matrix(pth,preds,pred_labels_list,strategy="many-to-many")
     pred_labels = np.mean(pred_labels_list,axis=0)
     pred_labels = np.where(pred_labels>0.5,1,0)
     # read column names
-    df1 = pd.read_csv(pth+"/Table1.csv")
-    df2 = pd.read_csv(pth+"/Table2.csv")
-    df1_cols = df1.columns
-    df2_cols = df2.columns
+    df1_cols = table1_df.columns
+    df2_cols = table2_df.columns
     # create similarity matrix for pred values
     preds_matrix = np.array(preds).reshape(len(df1_cols),len(df2_cols))
     # create similarity matrix for pred labels
@@ -58,16 +59,30 @@ def create_similarity_matrix(pth,preds,pred_labels_list,strategy="many-to-many")
             predicted_pairs.append((df_pred.index[i],df_pred.columns[j],df_pred.iloc[i,j]))
     return df_pred,df_pred_labels,predicted_pairs
 
-def schema_matching(pth,model_pth,threshold=None,strategy="many-to-many"):
+def schema_matching(table1_pth,table2_pth,threshold=None,strategy="many-to-many",model_pth=None):
     """
     Do schema matching!
     """
+    if model_pth is None:
+        model_pth = str(this_directory / "model" / "2022-04-12-12-06-32")
     # transform jsonl or json file to csv
-    for file in os.listdir(pth):
-        if file.endswith('.json') or file.endswith('.jsonl'):
-            make_csv_from_json(pth+"/"+file)
+    if table1_pth.endswith('.json') or table1_pth.endswith('.jsonl'):
+        table1_df = make_csv_from_json(table1_pth)
+    else:
+        table1_df = pd.read_csv(table1_pth)
+    if table2_pth.endswith('.json') or table2_pth.endswith('.jsonl'):
+        table2_df = make_csv_from_json(table2_pth)
+    else:
+        table2_df = pd.read_csv(table2_pth)
+
+    # filter columns
+    table1_df = table_column_filter(table1_df)
+    table2_df = table_column_filter(table2_df)
+
+    # extract features
+    features,_ = make_data_from(table1_df, table2_df, type="test")
 
-    features,_ = make_data_from(pth,"test")
+    # load model and predict on features
     preds = []
     pred_labels_list = []
     for i in range(len(os.listdir(model_pth))//2):
@@ -78,18 +93,18 @@ def schema_matching(pth,model_pth,threshold=None,strategy="many-to-many"):
         else:
             with open(model_pth+"/"+str(i)+".threshold",'r') as f:
                 best_threshold = float(f.read())
-        pred,pred_labels = test(bst,best_threshold,features,test_labels=np.ones(len(features)),type="inference")
+        pred, pred_labels = test(bst, best_threshold, features, test_labels=np.ones(len(features)), type="inference")
         preds.append(pred)
         pred_labels_list.append(pred_labels)
         del bst
 
-    df_pred,df_pred_labels,predicted_pairs = create_similarity_matrix(pth,preds,pred_labels_list,strategy=strategy)
+    df_pred,df_pred_labels,predicted_pairs = create_similarity_matrix(table1_df, table2_df, preds, pred_labels_list, strategy=strategy)
     return df_pred,df_pred_labels,predicted_pairs
 
 if __name__ == '__main__':
     start = time.time()
     args.path = args.path.rstrip("/")
-    df_pred,df_pred_labels,predicted_pairs = schema_matching(args.path,args.model,threshold=args.threshold,strategy=args.strategy)
+    df_pred,df_pred_labels,predicted_pairs = schema_matching(args.path+"/Table1.csv",args.path+"/Table2.csv",threshold=args.threshold,strategy=args.strategy,model_pth=args.model)
     df_pred.to_csv(args.path+"/similarity_matrix_value.csv",index=True)
     df_pred_labels.to_csv(args.path+"/similarity_matrix_label.csv",index=True)
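Since schema_matching now takes two table paths and makes the model path optional, it can be driven directly from Python as well as from the CLI. A minimal usage sketch, assuming hypothetical input files data/Table1.csv and data/Table2.json and the bundled default model:

from cal_column_similarity import schema_matching

# data/Table1.csv and data/Table2.json are hypothetical paths; JSON/JSONL
# inputs are converted via make_csv_from_json, CSV inputs are read directly.
df_pred, df_pred_labels, predicted_pairs = schema_matching(
    "data/Table1.csv",
    "data/Table2.json",
    strategy="one-to-one",   # default is "many-to-many"
)                            # model_pth=None resolves to model/2022-04-12-12-06-32
for col1, col2, score in predicted_pairs:
    print(col1, col2, score)

One caveat: the module still runs parser.parse_args() at import time, so importing it works cleanly only when the host process's argv carries no flags the parser rejects.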

relation_features.py

Lines changed: 26 additions & 26 deletions
@@ -12,6 +12,7 @@
 from nltk.translate.bleu_score import SmoothingFunction
 from sentence_transformers import util
 import re
+from utils import table_column_filter
 
 model = init.model
 
@@ -40,7 +41,7 @@ def read_mapping(mapping_file):
     """
     Read mapping file and return a set.
     """
-    if not os.path.exists(mapping_file):
+    if mapping_file is None or not os.path.exists(mapping_file):
         return set()
     with open(mapping_file, 'r') as f:
         readed = f.readlines()
@@ -57,20 +58,20 @@ def make_combinations_labels(columns1, columns2, mapping ,type="train"):
     Make combinations from columns1 list and columns2 list. Label them using mapping.
     """
     labels = {}
-    for c1 in columns1:
-        for c2 in columns2:
+    for i,c1 in enumerate(columns1):
+        for j,c2 in enumerate(columns2):
             if (c1, c2) in mapping or (c2, c1) in mapping:
-                labels[(c1, c2)] = 1
+                labels[(i, j)] = 1
             else:
-                labels[(c1, c2)] = 0
+                labels[(i, j)] = 0
     # sample negative labels
     if type == "train":
         combinations_count = len(labels)
         for i in range(combinations_count*2):
             if sum(labels.values()) >= 0.1 * len(labels):
                 break
-            c1 = random.choice(columns1)
-            c2 = random.choice(columns2)
+            c1 = random.choice(range(len(columns1)))
+            c2 = random.choice(range(len(columns2)))
             if (c1, c2) in labels and labels[c1, c2] == 0:
                 del labels[(c1, c2)]
     return labels
@@ -93,40 +94,34 @@ def get_instance_similarity(embeddings1, embeddings2):
     """
     cosine_similarity = np.inner(embeddings1, embeddings2) / (norm(embeddings1) * norm(embeddings2))
     return np.array([cosine_similarity])
-
-def make_data_from(folder_path,type="train"):
+
+def make_data_from(table1_df, table2_df,mapping_file=None,type="train"):
     """
-    Read data from folder and make relational features and labels as a matrix.
+    Read data from two table DataFrames and a mapping file path, and make relational features and labels as a matrix.
     """
-    mapping_file = folder_path + "/" + "mapping.txt"
-    table1 = folder_path + "/" + "Table1.csv"
-    table2 = folder_path + "/" + "Table2.csv"
-
     mapping = read_mapping(mapping_file)
-    table1_df = pd.read_csv(table1)
-    table2_df = pd.read_csv(table2)
-    columns1 = [c for c in list(table1_df.columns) if not "Unnamed:" in c]
-    columns2 = [c for c in list(table2_df.columns) if not "Unnamed:" in c]
+    columns1 = list(table1_df.columns)
+    columns2 = list(table2_df.columns)
 
     combinations_labels = make_combinations_labels(columns1, columns2, mapping,type)
-    table1_features = make_self_features_from(table1)
-    table2_features = make_self_features_from(table2)
+    table1_features = make_self_features_from(table1_df)
+    table2_features = make_self_features_from(table2_df)
 
     column_name_embeddings = {preprocess_text(k):model.encode(preprocess_text(k)) for k in columns1+columns2}
 
     additional_feature_num = 6
     output_feature_table = np.zeros((len(combinations_labels), table1_features.shape[1] - 768 + additional_feature_num), dtype=np.float32)
     output_labels = np.zeros(len(combinations_labels), dtype=np.int32)
     for i, (combination,label) in enumerate(combinations_labels.items()):
-        c1_name, c2_name = combination
-        c1 = columns1.index(c1_name)
-        c2 = columns2.index(c2_name)
+        c1,c2 = combination
+        c1_name = columns1[c1]
+        c2_name = columns2[c2]
         difference_features_percent = np.abs(table1_features[c1] - table2_features[c2]) / (table1_features[c1] + table2_features[c2] + 1e-8)
         c1_name = preprocess_text(c1_name)
         c2_name = preprocess_text(c2_name)
         colnames_features = get_colnames_features(c1_name, c2_name,column_name_embeddings)
         instance_similarity = get_instance_similarity(table1_features[c1][-768:], table2_features[c2][-768:])
-        output_feature_table[i,:] = np.concatenate((difference_features_percent[:-768], colnames_features,instance_similarity))
+        output_feature_table[i,:] = np.concatenate((difference_features_percent[:-768], colnames_features, instance_similarity))
         output_labels[i] = label
         # add column names mask for training data
         if type == "train" and i % 5 == 0:
@@ -153,10 +148,15 @@ def make_data_from(folder_path,type="train"):
     for folder in folder_list:
         print("start extracting data from " + folder)
         data_folder = "Training Data/" + folder
-        features,labels = make_data_from(data_folder,"train")
+        table1_df = pd.read_csv(data_folder + "/Table1.csv")
+        table2_df = pd.read_csv(data_folder + "/Table2.csv")
+        table1_df = table_column_filter(table1_df)
+        table2_df = table_column_filter(table2_df)
+        mapping_file = data_folder + "/mapping.txt"
+        features,labels = make_data_from(table1_df, table2_df, mapping_file,type="train")
         train_features[folder] = features
         train_labels[folder] = labels
-        features,labels = make_data_from(data_folder,"test")
+        features,labels = make_data_from(table1_df, table2_df, mapping_file,type="test")
         test_features[folder] = features
         test_labels[folder] = labels
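make_data_from now consumes two DataFrames and an optional mapping file instead of a folder path, which separates feature extraction from the on-disk layout. A small sketch of both call sites, with hypothetical file names:

import pandas as pd
from utils import table_column_filter
from relation_features import make_data_from

# Hypothetical paths. Filtering first keeps the feature rows aligned with
# the columns that survive table_column_filter.
table1_df = table_column_filter(pd.read_csv("Table1.csv"))
table2_df = table_column_filter(pd.read_csv("Table2.csv"))

# Inference: with mapping_file=None, read_mapping returns an empty set, so
# every pair is labeled 0 and the negative-sampling branch is skipped.
features, _ = make_data_from(table1_df, table2_df, type="test")

# Training: pass the ground-truth mapping file explicitly.
features, labels = make_data_from(table1_df, table2_df, "mapping.txt", type="train")

Keying combinations by column index rather than by name also sidesteps ambiguity when the two tables share or repeat column names.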

self_features.py

Lines changed: 7 additions & 10 deletions
@@ -38,6 +38,7 @@ def mainly_numeric(data_list):
     """
     cnt = 0
     for data in data_list:
+        data = str(data)
         data = data.replace(",", "")
         for unit in unit_dict.keys():
             data = data.replace(unit, "")
@@ -171,8 +172,6 @@ def extract_features(data_list):
     Extract some features from the given data(column) or list
     """
     data_list = [d for d in data_list if d == d and d != "--"]
-    if len(data_list) == 0:
-        return 0
     data_types = ("url","numeric","date","string")
     # Classify the data's type, URL or Date or Numeric
     if is_url(data_list):
@@ -204,18 +203,15 @@ def extract_features(data_list):
     output_features = np.concatenate((data_type_feature, num_fts, length_fts, char_fts, deep_fts))
     return output_features
 
-def make_self_features_from(filepath):
+def make_self_features_from(table_df):
     """
     Extracts features from the given table path and returns a feature table.
     """
-    df = load_table(filepath)
     features = None
-    for column in df.columns:
+    for column in table_df.columns:
         if "Unnamed:" in column:
             continue
-        fts = extract_features(df[column])
-        if type(fts) == int:
-            continue
+        fts = extract_features(table_df[column])
         fts = fts.reshape(1, -1)
         if features is None:
             features = fts
@@ -224,5 +220,6 @@ def make_self_features_from(filepath):
     return features
 
 if __name__ == '__main__':
-    features = make_self_features_from("Training Data/pair_7/Table1.csv")
-    print(features)
+    features = make_self_features_from(load_table("Test Data/0archive/Table2.csv"))
+    print(features)
+    print(features.shape)
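make_self_features_from now expects a DataFrame rather than a file path, and mainly_numeric coerces each value to str before stripping separators, so numeric dtypes no longer break the replace calls. A quick sketch, assuming a hypothetical CSV:

import pandas as pd
from self_features import make_self_features_from

table_df = pd.read_csv("Table1.csv")          # hypothetical path
features = make_self_features_from(table_df)  # one feature row per kept column
print(features.shape)

Note that features stays None for a table with no usable columns, which is why the pipeline runs table_column_filter before extracting features.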

utils.py

Lines changed: 31 additions & 5 deletions
@@ -29,20 +29,46 @@ def make_csv_from_json(file_path):
     """
     Make csv file from json file.
     """
-    with open(file_path, 'r', encoding='utf-8') as f:
-        data = json.load(f)
+    if file_path.endswith(".json"):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    elif file_path.endswith(".jsonl"):
+        data = []
+        with open(file_path, 'r') as json_file:
+            json_list = list(json_file)
+        for json_str in json_list:
+            data.append(json.loads(json_str))
 
     # find key_values
     if isinstance(data, dict):
         key_values = find_all_keys_values(data,"")
     elif isinstance(data, list):
-        key_values = find_all_keys_values({"data":data},"")
+        key_values = find_all_keys_values({"TOPLEVEL":data},"TOPLEVEL")
     else:
         raise ValueError('Your input JsonData is not a dictionary or list')
 
-    key_values = {k:v for k,v in key_values.items() if len(v)>1}
+    key_values = {k.replace("TOPLEVEL.",""):v for k,v in key_values.items() if len(v)>1}
 
     df = pd.DataFrame({k:pd.Series(v) for k,v in key_values.items()})
     # save to csv
     save_pth = re.sub(r'\.jsonl?','.csv',file_path)
-    df.to_csv(save_pth, index=False, encoding='utf-8')
+    df.to_csv(save_pth, index=False, encoding='utf-8')
+    return df
+
+def table_column_filter(table_df):
+    """
+    Filter out columns that have no instances or whose instances are all "--".
+    """
+    original_columns = table_df.columns
+    for column in table_df.columns:
+        column_data = [d for d in list(table_df[column]) if d == d and d != "--"]
+        if len(column_data) <= 1:
+            table_df = table_df.drop(column, axis=1)
+            continue
+        if "Unnamed:" in column:
+            table_df = table_df.drop(column, axis=1)
+            continue
+    remove_columns = list(set(original_columns) - set(table_df.columns))
+    if len(remove_columns) > 0:
+        print("Removed columns:", remove_columns)
+    return table_df
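make_csv_from_json now returns the DataFrame it builds (still writing the .csv next to the input) and understands JSON Lines, while table_column_filter centralizes the column pruning that the feature code used to do inline. A short sketch with a hypothetical .jsonl input:

from utils import make_csv_from_json, table_column_filter

# Hypothetical path; data/records.csv is also written as a side effect.
df = make_csv_from_json("data/records.jsonl")

# Drops "Unnamed:" columns and any column with at most one usable
# (non-NaN, non "--") value, printing what was removed.
df = table_column_filter(df)

The TOPLEVEL sentinel exists only to give a top-level JSON array a key for find_all_keys_values; it is stripped from the column names before the DataFrame is built.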
