From 907a4c189cfea03ef460f0ffd70a35fa5ea273e1 Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Wed, 9 Feb 2022 13:21:03 +0300
Subject: [PATCH 1/9] CodeBERT-based model draft

---
 requirements.txt                       |   3 +
 src/bert_attempts/AccuracyEvaluator.py | 128 +++++++++++++
 src/bert_attempts/BertBased.py         | 243 +++++++++++++++++++++++++
 3 files changed, 374 insertions(+)
 create mode 100644 src/bert_attempts/AccuracyEvaluator.py
 create mode 100644 src/bert_attempts/BertBased.py

diff --git a/requirements.txt b/requirements.txt
index f5315eb..cb6c546 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,6 @@ tqdm==4.58.0
 sentencepiece==0.1.95
 pydot==1.4.2
 tensorflow-text==2.5.0
+torch==1.10.0
+torchvision==0.11.1
+transformers==4.15.0
diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py
new file mode 100644
index 0000000..a114301
--- /dev/null
+++ b/src/bert_attempts/AccuracyEvaluator.py
@@ -0,0 +1,128 @@
+import io
+import datetime
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+
+from typing import List
+from sklearn.manifold import TSNE
+from sklearn.metrics import accuracy_score
+from sklearn.neighbors import KNeighborsClassifier
+
+class AccuracyEvaluator:
+
+    def __init__(self,
+                 # X_train: np.ndarray,
+                 X_test: np.ndarray,
+                 # y_train: np.ndarray,
+                 y_test: np.ndarray,
+                 threshold: float = 0.1,
+                 input_size: int = 500,
+                 authors: List = list(range(20))):
+        """
+        Parameters:
+        - `X_train`, `X_test` - np.arrays with the data (tokens)
+        - `y_train`, `y_test` - np.arrays with the labels (numerical representation of the authors)
+
+        - `threshold` - the alpha (margin) parameter of the triplet loss; the distance threshold for classification
+        - `input_size` - number of tokens in one file
+        - `authors` - list of author ids; the prediction stage requires an all-with-all
+          comparison (O(n^2)), so it is restricted to this subset for plotting and evaluation
+        """
+        super().__init__()
+        self.threshold = threshold
+        self.input_size = input_size
+        # x-y preprocessing
+        self.authors = authors
+
+        def select_authors(initial_x, initial_y):
+            index = np.where(np.isin(initial_y, self.authors))[0]
+            new_x, new_y = map(lambda a: a[index], [initial_x, initial_y])
+            return new_x, new_y
+
+        # simple_x_train, simple_y_train = select_authors(X_train, y_train)
+        simple_x_test, simple_y_test = select_authors(X_test, y_test)
+
+        self.data = {
+            "simple": {
+                # "train": [simple_x_train, simple_y_train],
+                "test": [simple_x_test, simple_y_test]
+            },
+            "full": {
+                # "train": [X_train, y_train],
+                "test": [X_test, y_test]
+            }
+        }
+
+        # counter initialization
+        self.n = 0
+
+    @staticmethod
+    def _plot_to_image(figure):
+        # https://www.tensorflow.org/tensorboard/image_summaries
+        # NOTE: the buffer is prepared but decoding it into an image is not implemented yet
+        buf = io.BytesIO()
+        plt.savefig(buf, format="png")
+        plt.close(figure)
+        buf.seek(0)
+
+    def apply_dimensionality_reduction(self,
+                                       transformed_x: np.ndarray,
+                                       y: np.ndarray,
+                                       epoch: int,
+                                       is_test: bool):
+        vectors = TSNE(n_components=2)
+        x_pca = vectors.fit_transform(transformed_x)
+        figure = plt.figure(figsize=(10, 8))
+        plt.title("Step {} (epoch {})".format(self.n, epoch))
+        for developer in self.authors:
+            indexes = np.where(y == developer)[0]
+            plt.plot(x_pca[indexes, 0], x_pca[indexes, 1], "o", ms=5)
+        # save as file
+        plt.savefig("../outputs/tsne_{}/tsne_{}.png".format('bert', self.n))
+        # log to tensorboard
+        # image = self._plot_to_image(figure)
+        # writer = self.test_summary_writer if is_test else self.train_summary_writer
+        # with writer.as_default():
+        #     tf.summary.image("Distribution of authors", image, step=self.n)
+        plt.close("all")
+
+    def get_acc(self,
+                model,
+                x: np.ndarray,
+                y: np.ndarray,
+                epoch: int,
+                is_test: bool,
+                dim_red: True) -> float:
+
+        transformed_x = model(x)
+        knn = KNeighborsClassifier().fit(transformed_x, y)
+        predictions = knn.predict(transformed_x)
+        accuracy = accuracy_score(y_true=y, y_pred=predictions)
+        # if dim_red:
+        #     self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test)
+        return accuracy
+
+    def _writer(self,
+                x,
+                y,
+                model,
+                epoch: int,
+                is_test: bool,
+                is_simple: bool) -> float:
+
+        accuracy = self.get_acc(model, x, y, epoch, is_test, is_simple)
+        return accuracy
+
+    def on_epoch_end(self,
+                     model,
+                     epoch: int,
+                     loss: float):
+
+        # astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True)
+        aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True)
+        afte = self._writer(*self.data["full"]["test"], model, epoch, True, False)
+
+        print(loss, aste, afte)
+        self.n += 1
+        return aste, afte
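A note on the evaluation above: `get_acc` fits a `KNeighborsClassifier` on the transformed embeddings and then scores it on the same points, so (with the default k=5) the reported accuracy measures how tightly same-author samples cluster rather than performance on held-out data. A minimal sketch of that measurement with synthetic stand-ins — only the shapes follow this patch, the data is illustrative:

```python
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

embeddings = np.random.rand(100, 256)    # stand-in for model(x)
labels = np.random.randint(0, 20, 100)   # stand-in for author ids

# fit and predict on the same points: each query point's own label
# competes with its nearest neighbours, so the score is optimistic
knn = KNeighborsClassifier().fit(embeddings, labels)
print(accuracy_score(labels, knn.predict(embeddings)))
```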
diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
new file mode 100644
index 0000000..2dcb1e5
--- /dev/null
+++ b/src/bert_attempts/BertBased.py
@@ -0,0 +1,243 @@
+from transformers import RobertaTokenizer, RobertaModel
+import tqdm
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from AccuracyEvaluator import AccuracyEvaluator
+from sklearn.neighbors import BallTree
+from sklearn.preprocessing import LabelEncoder
+
+# with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+if device.type == "cuda":
+    torch.cuda.get_device_name()
+
+# -------------------------- constants
+df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
+tmp_dataset_dir = "../../inputs/preprocessed_jsons/"
+tmp_dataset_filename = tmp_dataset_dir + 'bert' + "_train.json"
+
+INPUT_SIZE = 512  # 514 tokens, maximum for bert
+OUTPUT_SIZE = 256
+N_EPOCHS = 100
+BATCH_SIZE = 16
+
+# -------------------------- load data
+df = pd.read_csv(df_path)
+# df = df.drop(columns=["round", "task", "solution", "file",
+#                       "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"])
+# df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n"))
+# df = df[(df.n_lines > 0)]
+
+
+def _insert_tokens(x: str):
+    x = x.replace("\n", " NLN ")
+    x = x.replace("\t", " TAB ")
+    x = x.replace(" ", " SPC ")
+    return x
+
+df.flines = df.flines.apply(_insert_tokens)
+
+# load tokenizer
+tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
+df.index = np.arange(len(df))
+le = LabelEncoder()
+df.user = le.fit_transform(df.user)
+df['tokens'] = df.flines.apply(lambda x:
+                               tokenizer.convert_tokens_to_ids(
+                                   tokenizer.tokenize(x)))
+
+dataset = df[["user", "tokens", "task"]]
+# shuffle dataset
+dataset = dataset.sample(frac=1)
+
+X = dataset.tokens.values
+
+def fillZeros(arr):
+    arr = np.array(arr)
+    if INPUT_SIZE > arr.shape[0]:
+        arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant')
+    else:
+        arr = arr[:INPUT_SIZE]
+    return arr.reshape(INPUT_SIZE, 1).tolist()
+
+X = np.array([fillZeros(x) for x in X])
+X = X.reshape((-1, INPUT_SIZE))
+y = np.array(dataset.user)
+tasks = np.array(dataset.task)
+train_indexes = np.where(tasks < 7)[0]
+test_indexes = np.where(tasks >= 7)[0]
+X_train, X_test = X[train_indexes], X[test_indexes]
+y_train, y_test = y[train_indexes], y[test_indexes]  # 244 unique person
+
+# -------------------------- model architecture
+
+# let's do just a simple 
thing + +# 1. embedding from bert -> INPUT_SIZE * 768 +# 2. convolution (5*768) +# 3. fully-connected 500 +# 4. fully connected 100 + +# 1. pretrained part + +# train loader +class GCJ: + def __init__(self, X_train, y_train, batch_size = BATCH_SIZE): + self.x = X_train + self.y = y_train + self.batch_size = batch_size + + def batch_generator(self, model, tree): + n_positive = self.batch_size // 2 + anchor_index = np.random.choice(self.y.shape[0], 1) + y_anchor = y[anchor_index] + positive_indexes = np.where(self.y == y_anchor)[0] + n_same = positive_indexes.shape[0] + positive_indexes = positive_indexes[:n_positive] + k = self.batch_size - positive_indexes.shape[0] + + if tree is not None: + query = model(self.x[anchor_index]) + query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] + negative_indexes = np.array([neighbour_index for neighbour_index in query_res + if self.y[neighbour_index] != y_anchor])[:k] + else: # the first batch generation + negative_indexes = np.where(self.y != y_anchor)[0] + np.random.shuffle(negative_indexes) + negative_indexes = negative_indexes[:k] + + local_x = self.x.reshape((-1, INPUT_SIZE)) + + reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), + [positive_indexes, negative_indexes]) + + positive, negative = map(lambda i: local_x[i], reduced_indexes) + anchor = np.array([local_x[anchor_index] for _ in range(self.batch_size)]).reshape((-1, INPUT_SIZE)) + + return anchor, positive, negative + + def generator(self, model, tree): + while True: + yield self.batch_generator(model, tree) + + +# model + +class Network(nn.Module): + def __init__(self): + super(Network, self).__init__() + # conv_sizes = [2, 4, 16] + k_size = 8 + self.pool_size = INPUT_SIZE - k_size + 1 # output for conv + self.channels = 4 + self.conv = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), + nn.ReLU(), + ) + # for size in conv_sizes + # ] + self.fc = nn.Sequential( + nn.Linear(self.pool_size*self.channels, INPUT_SIZE), + nn.ReLU(), + nn.Linear(INPUT_SIZE, OUTPUT_SIZE), + nn.ReLU() + ) + + def forward(self, x): + # array = [conv(x) for conv in self.conv] + x = torch.reshape(x, (-1, 1, 512, 768)) + x = self.conv(x) + x = x.view(-1, self.channels*self.pool_size) + # x = torch.concat(array, dim=1) + x = self.fc(x) + return x + +# + +# configs +class TripletLoss(nn.Module): + def __init__(self, margin=1.0): + super(TripletLoss, self).__init__() + self.margin = margin + + def calc_euclidean(self, x1, x2): + return (x1 - x2).pow(2).sum(1) + + def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor: + distance_positive = self.calc_euclidean(anchor, positive) + distance_negative = self.calc_euclidean(anchor, negative) + losses = torch.relu(distance_positive - distance_negative + self.margin) + + return losses.mean() + +def init_weights(m): + if isinstance(m, nn.Conv2d): + torch.nn.init.xavier_normal_(m.weight) + + +data_loader = GCJ(X_train, y_train) + +embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") + +model = Network() +model.apply(init_weights) +model = torch.jit.script(model).to(device) + +tree = None # default value + +optimizer = optim.Adam(model.parameters(), lr=0.001) +criterion = torch.jit.script(TripletLoss()) + + +# test_emb = embedding_model(torch.from_numpy(X_test)).last_hidden_state +x_emb = [] +for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): + xs = X_train[i: i+BATCH_SIZE] + new_xs = 
embedding_model(torch.from_numpy(xs)).last_hidden_state + x_emb = [*x_emb, *new_xs] + +x_emb = np.array(x_emb) +# Connected to pydev debugger (build 213.5744.248) +# 16%|█▌ | 5/31 [03:37<18:48, 43.42s/it]/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown +# warnings.warn('resource_tracker: There appear to be %d ' +# +# Process finished with exit code 137 (interrupted by signal 9: SIGKILL) +callback = AccuracyEvaluator(x_emb, y_test, input_size=768) + +# training loop +model.train() +for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): + running_loss = [] + for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): + anchor, positive, negative = data_loader.batch_generator(model, tree) + anchor = embedding_model(anchor).last_hidden_state + positive = embedding_model(positive).last_hidden_state + negative = embedding_model(negative).last_hidden_state + + optimizer.zero_grad() + + anchor_out = model(anchor) + positive_out = model(positive) + negative_out = model(negative) + + loss = criterion(anchor_out, positive_out, negative_out) + loss.backward() + optimizer.step() + + # predictions = model(x_emb) + # tree = BallTree(predictions, metric="euclidean") + + current_loss = loss.cpu().detach().numpy() + print(current_loss) + running_loss.append(current_loss) + + # callback (accuracy) + metrics = callback.on_epoch_end(model, epoch, current_loss) + print(metrics) + + print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss))) \ No newline at end of file From 93839ca04b8773a81934ddf0697b0cea45e46c1c Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Wed, 9 Feb 2022 18:10:14 +0300 Subject: [PATCH 2/9] Simple model is ready for training --- src/bert_attempts/AccuracyEvaluator.py | 11 +- src/bert_attempts/BertBased.py | 150 ++++++++++++++----------- 2 files changed, 90 insertions(+), 71 deletions(-) diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py index a114301..2171d53 100644 --- a/src/bert_attempts/AccuracyEvaluator.py +++ b/src/bert_attempts/AccuracyEvaluator.py @@ -13,7 +13,7 @@ class AccuracyEvaluator: def __init__(self, # X_train: np.ndarray, - X_test: np.ndarray, + X_test, # y_train: np.ndarray, y_test: np.ndarray, threshold: float = 0.1, @@ -37,7 +37,8 @@ def __init__(self, def select_authors(initial_x, initial_y): index = np.where(np.isin(initial_y, self.authors))[0] - new_x, new_y = map(lambda a: a[index], [initial_x, initial_y]) + new_x = [initial_x[i] for i in index] + new_y = initial_y[index] return new_x, new_y # simple_x_train, simple_y_train = select_authors(X_train, y_train) @@ -89,13 +90,13 @@ def apply_dimensionality_reduction(self, def get_acc(self, model, - x: np.ndarray, + x, y: np.ndarray, epoch: int, is_test: bool, dim_red: True) -> float: - - transformed_x = model(x) + with torch.no_grad(): + transformed_x = model(torch.cat(x)) knn = KNeighborsClassifier().fit(transformed_x, y) predictions = knn.predict(transformed_x) accuracy = accuracy_score(y_true=y, y_pred=predictions) diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index 2dcb1e5..533d816 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -8,7 +8,6 @@ from AccuracyEvaluator import AccuracyEvaluator from sklearn.neighbors import BallTree from sklearn.preprocessing import 
LabelEncoder - # with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -21,58 +20,90 @@ tmp_dataset_dir = "../../inputs/preprocessed_jsons/" tmp_dataset_filename = tmp_dataset_dir + 'bert' + "_train.json" -INPUT_SIZE = 512 # 514 tokens, maximum for bert +INPUT_SIZE = 512 # 514 tokens, maximum for bert OUTPUT_SIZE = 256 N_EPOCHS = 100 BATCH_SIZE = 16 # -------------------------- load data -df = pd.read_csv(df_path) -# df = df.drop(columns=["round", "task", "solution", "file", -# "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) -# df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) -# df = df[(df.n_lines > 0)] - - -def _insert_tokens(x: str): - x = x.replace("\n", " NLN ") - x = x.replace("\t", " TAB ") - x = x.replace(" ", " SPC ") - return x - -df.flines = df.flines.apply(_insert_tokens) - -# load tokenizer -tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") -df.index = np.arange(len(df)) -le = LabelEncoder() -df.user = le.fit_transform(df.user) -df['tokens'] = df.flines.apply(lambda x: - tokenizer.convert_tokens_to_ids( - tokenizer.tokenize(x))) - -dataset = df[["user", "tokens", "task"]] -# shuffle dataset -dataset = dataset.sample(frac=1) - -X = dataset.tokens.values - -def fillZeros(arr): - arr = np.array(arr) - if INPUT_SIZE > arr.shape[0]: - arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') - else: - arr = arr[:INPUT_SIZE] - return arr.reshape(INPUT_SIZE, 1).tolist() - -X = np.array([fillZeros(x) for x in X]) -X = X.reshape((-1, INPUT_SIZE)) -y = np.array(dataset.user) -tasks = np.array(dataset.task) -train_indexes = np.where(tasks < 7)[0] -test_indexes = np.where(tasks >= 7)[0] -X_train, X_test = X[train_indexes], X[test_indexes] -y_train, y_test = y[train_indexes], y[test_indexes] # 244 unique person + + +def generate_data(): + df = pd.read_csv(df_path) + # df = df.drop(columns=["round", "task", "solution", "file", + # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) + # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) + # df = df[(df.n_lines > 0)] + + + # def _insert_tokens(x: str): + # x = x.replace("\n", " NLN ") + # x = x.replace("\t", " TAB ") + # x = x.replace(" ", " SPC ") + # return x + # + # df.flines = df.flines.apply(_insert_tokens) + + # load tokenizer + tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") + df.index = np.arange(len(df)) + le = LabelEncoder() + df.user = le.fit_transform(df.user) + df['tokens'] = df.flines.apply(lambda x: + tokenizer.convert_tokens_to_ids( + tokenizer.tokenize(x))) + + dataset = df[["user", "tokens", "task"]] + # shuffle dataset + dataset = dataset.sample(frac=1) + + X = dataset.tokens.values + + def fillZeros(arr): + arr = np.array(arr) + if INPUT_SIZE > arr.shape[0]: + arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') + else: + arr = arr[:INPUT_SIZE] + return arr.reshape(INPUT_SIZE, 1).tolist() + + X = np.array([fillZeros(x) for x in X]) + X = X.reshape((-1, INPUT_SIZE)) + y = np.array(dataset.user) + tasks = np.array(dataset.task) + train_indexes = np.where(tasks < 7)[0] + test_indexes = np.where(tasks >= 7)[0] + X_train, X_test = X[train_indexes], X[test_indexes] + y_train, y_test = y[train_indexes], y[test_indexes] # 244 unique person + + embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") + + x_emb = [] + with torch.no_grad(): + for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): + xs = X_train[i: 
i+BATCH_SIZE] + new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state + x_emb = [*x_emb, *new_xs] + + # save x_emb, x_train, y_test, y_train + + np.save('x_train.np', X_train) + np.save('y_test.np', y_test) + np.save('y_train.np', y_train) + np.save('x_test.np', X_test) + + for idx, tensor in enumerate(x_emb): + torch.save(tensor, f"test_tensors/tensor{idx}.pt") + + +# generate_data() +print('restoring') + +X_train = np.load('x_train.np.npy') +y_test = np.load('y_test.np.npy') +y_train = np.load('y_train.np.npy') +X_test = np.load('x_test.np.npy') +x_emb = [torch.load(f"test_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] # -------------------------- model architecture @@ -95,7 +126,7 @@ def __init__(self, X_train, y_train, batch_size = BATCH_SIZE): def batch_generator(self, model, tree): n_positive = self.batch_size // 2 anchor_index = np.random.choice(self.y.shape[0], 1) - y_anchor = y[anchor_index] + y_anchor = self.y[anchor_index] positive_indexes = np.where(self.y == y_anchor)[0] n_same = positive_indexes.shape[0] positive_indexes = positive_indexes[:n_positive] @@ -193,20 +224,6 @@ def init_weights(m): optimizer = optim.Adam(model.parameters(), lr=0.001) criterion = torch.jit.script(TripletLoss()) - -# test_emb = embedding_model(torch.from_numpy(X_test)).last_hidden_state -x_emb = [] -for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): - xs = X_train[i: i+BATCH_SIZE] - new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state - x_emb = [*x_emb, *new_xs] - -x_emb = np.array(x_emb) -# Connected to pydev debugger (build 213.5744.248) -# 16%|█▌ | 5/31 [03:37<18:48, 43.42s/it]/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown -# warnings.warn('resource_tracker: There appear to be %d ' -# -# Process finished with exit code 137 (interrupted by signal 9: SIGKILL) callback = AccuracyEvaluator(x_emb, y_test, input_size=768) # training loop @@ -215,9 +232,10 @@ def init_weights(m): running_loss = [] for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): anchor, positive, negative = data_loader.batch_generator(model, tree) - anchor = embedding_model(anchor).last_hidden_state - positive = embedding_model(positive).last_hidden_state - negative = embedding_model(negative).last_hidden_state + with torch.no_grad(): + anchor = embedding_model(torch.from_numpy(anchor)).last_hidden_state + positive = embedding_model(torch.from_numpy(positive)).last_hidden_state + negative = embedding_model(torch.from_numpy(negative)).last_hidden_state optimizer.zero_grad() From 77791ca1484a9e737d2bc3cdf1f7bc3b86c53bec Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Wed, 9 Feb 2022 21:58:12 +0300 Subject: [PATCH 3/9] Added embeddings for training set --- src/bert_attempts/AccuracyEvaluator.py | 27 ++++++++++++++------------ src/bert_attempts/BertBased.py | 21 ++++++++++++++++---- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py index 2171d53..376b237 100644 --- a/src/bert_attempts/AccuracyEvaluator.py +++ b/src/bert_attempts/AccuracyEvaluator.py @@ -12,9 +12,9 @@ class AccuracyEvaluator: def __init__(self, - # X_train: np.ndarray, + X_train, X_test, - # y_train: np.ndarray, + y_train: np.ndarray, y_test: np.ndarray, threshold: float = 0.1, input_size: int = 500, @@ -37,20 
+37,23 @@ def __init__(self, def select_authors(initial_x, initial_y): index = np.where(np.isin(initial_y, self.authors))[0] - new_x = [initial_x[i] for i in index] + new_x = initial_x[index] new_y = initial_y[index] return new_x, new_y - # simple_x_train, simple_y_train = select_authors(X_train, y_train) + X_test = torch.cat(X_test) + X_train = torch.cat(X_train) + + simple_x_train, simple_y_train = select_authors(X_train, y_train) simple_x_test, simple_y_test = select_authors(X_test, y_test) self.data = { "simple": { - # "train": [simple_x_train, simple_y_train], + "train": [simple_x_train, simple_y_train], "test": [simple_x_test, simple_y_test] }, "full": { - # "train": [X_train, y_train], + "train": [X_train, y_train], "test": [X_test, y_test] } } @@ -67,7 +70,7 @@ def _plot_to_image(figure): buf.seek(0) def apply_dimensionality_reduction(self, - transformed_x: np.ndarray, + transformed_x, y: np.ndarray, epoch: int, is_test: bool): @@ -100,8 +103,8 @@ def get_acc(self, knn = KNeighborsClassifier().fit(transformed_x, y) predictions = knn.predict(transformed_x) accuracy = accuracy_score(y_true=y, y_pred=predictions) - # if dim_red: - # self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test) + if dim_red: + self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test) return accuracy def _writer(self, @@ -120,10 +123,10 @@ def on_epoch_end(self, epoch: int, loss: float): - # astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) + astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True) afte = self._writer(*self.data["full"]["test"], model, epoch, True, False) - print(loss, aste, afte) + print(loss,astr, aste, afte) self.n += 1 - return aste, afte + return astr, aste, afte diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index 533d816..1efad7a 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -85,6 +85,12 @@ def fillZeros(arr): new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state x_emb = [*x_emb, *new_xs] + x_train_emb = [] + with torch.no_grad(): + for i in tqdm.tqdm(range(0, X_train.shape[0], BATCH_SIZE)): + xs = X_train[i: i+BATCH_SIZE] + new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state + x_train_emb = [*x_train_emb, *new_xs] # save x_emb, x_train, y_test, y_train np.save('x_train.np', X_train) @@ -95,8 +101,12 @@ def fillZeros(arr): for idx, tensor in enumerate(x_emb): torch.save(tensor, f"test_tensors/tensor{idx}.pt") + for idx, tensor in enumerate(x_train_emb): + torch.save(tensor, f"train_tensors/tensor{idx}.pt") -# generate_data() + + +generate_data() print('restoring') X_train = np.load('x_train.np.npy') @@ -104,6 +114,7 @@ def fillZeros(arr): y_train = np.load('y_train.np.npy') X_test = np.load('x_test.np.npy') x_emb = [torch.load(f"test_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] +x_train_emb = [torch.load(f"train_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] # -------------------------- model architecture @@ -224,8 +235,9 @@ def init_weights(m): optimizer = optim.Adam(model.parameters(), lr=0.001) criterion = torch.jit.script(TripletLoss()) -callback = AccuracyEvaluator(x_emb, y_test, input_size=768) +callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768) +x_train_emb = torch.cat(x_train_emb) # training loop model.train() for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): @@ -247,8 +259,9 
@@ def init_weights(m): loss.backward() optimizer.step() - # predictions = model(x_emb) - # tree = BallTree(predictions, metric="euclidean") + with torch.no_grad(): + predictions = model(x_train_emb) + tree = BallTree(predictions, metric="euclidean") current_loss = loss.cpu().detach().numpy() print(current_loss) From fe83565bc735ca427e4883e90b4e9ae8035b3ed9 Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Fri, 11 Feb 2022 09:34:55 +0300 Subject: [PATCH 4/9] actual first training --- src/bert_attempts/AccuracyEvaluator.py | 9 ++--- src/bert_attempts/BertBased.py | 54 +++++++++++++------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py index 376b237..e1ab84c 100644 --- a/src/bert_attempts/AccuracyEvaluator.py +++ b/src/bert_attempts/AccuracyEvaluator.py @@ -41,9 +41,6 @@ def select_authors(initial_x, initial_y): new_y = initial_y[index] return new_x, new_y - X_test = torch.cat(X_test) - X_train = torch.cat(X_train) - simple_x_train, simple_y_train = select_authors(X_train, y_train) simple_x_test, simple_y_test = select_authors(X_test, y_test) @@ -82,7 +79,7 @@ def apply_dimensionality_reduction(self, indexes = np.where(y == developer)[0] plt.plot(x_pca[indexes, 0], x_pca[indexes, 1], "o", ms=5) # save as file - plt.savefig("../outputs/tsne_{}/tsne_{}.png".format('bert', self.n)) + plt.savefig("outputs/tsne_{}.png".format( self.n)) # log to tensorboard # image = self._plot_to_image(figure) # writer = self.test_summary_writer if is_test else self.train_summary_writer @@ -99,7 +96,7 @@ def get_acc(self, is_test: bool, dim_red: True) -> float: with torch.no_grad(): - transformed_x = model(torch.cat(x)) + transformed_x = model(x) knn = KNeighborsClassifier().fit(transformed_x, y) predictions = knn.predict(transformed_x) accuracy = accuracy_score(y_true=y, y_pred=predictions) @@ -123,8 +120,8 @@ def on_epoch_end(self, epoch: int, loss: float): - astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True) + astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) afte = self._writer(*self.data["full"]["test"], model, epoch, True, False) print(loss,astr, aste, afte) diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index 1efad7a..ff08f14 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -22,7 +22,7 @@ INPUT_SIZE = 512 # 514 tokens, maximum for bert OUTPUT_SIZE = 256 -N_EPOCHS = 100 +N_EPOCHS = 30 BATCH_SIZE = 16 # -------------------------- load data @@ -85,6 +85,12 @@ def fillZeros(arr): new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state x_emb = [*x_emb, *new_xs] + np.save('x_train.np', X_train) + np.save('y_test.np', y_test) + np.save('y_train.np', y_train) + np.save('x_test.np', X_test) + torch.save(torch.cat(x_emb), 'test_tensor.pt') + print("main part saved") x_train_emb = [] with torch.no_grad(): for i in tqdm.tqdm(range(0, X_train.shape[0], BATCH_SIZE)): @@ -93,28 +99,19 @@ def fillZeros(arr): x_train_emb = [*x_train_emb, *new_xs] # save x_emb, x_train, y_test, y_train - np.save('x_train.np', X_train) - np.save('y_test.np', y_test) - np.save('y_train.np', y_train) - np.save('x_test.np', X_test) - - for idx, tensor in enumerate(x_emb): - torch.save(tensor, f"test_tensors/tensor{idx}.pt") + torch.save(torch.cat(x_train_emb), 'train_tensor.pt') - for idx, tensor in enumerate(x_train_emb): - 
torch.save(tensor, f"train_tensors/tensor{idx}.pt") - - - -generate_data() +# generate_data() print('restoring') X_train = np.load('x_train.np.npy') y_test = np.load('y_test.np.npy') y_train = np.load('y_train.np.npy') X_test = np.load('x_test.np.npy') -x_emb = [torch.load(f"test_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] -x_train_emb = [torch.load(f"train_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] +x_emb = torch.load('test_tensor.pt') +x_train_emb = torch.load('train_tensor.pt') +x_emb = torch.reshape(x_emb, (-1, 512, 768)) +x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768)) # -------------------------- model architecture @@ -144,7 +141,8 @@ def batch_generator(self, model, tree): k = self.batch_size - positive_indexes.shape[0] if tree is not None: - query = model(self.x[anchor_index]) + with torch.no_grad(): + query = model(self.x[anchor_index]) query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] negative_indexes = np.array([neighbour_index for neighbour_index in query_res if self.y[neighbour_index] != y_anchor])[:k] @@ -153,13 +151,13 @@ def batch_generator(self, model, tree): np.random.shuffle(negative_indexes) negative_indexes = negative_indexes[:k] - local_x = self.x.reshape((-1, INPUT_SIZE)) + local_x = self.x.reshape((-1, INPUT_SIZE, 768)) reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), [positive_indexes, negative_indexes]) positive, negative = map(lambda i: local_x[i], reduced_indexes) - anchor = np.array([local_x[anchor_index] for _ in range(self.batch_size)]).reshape((-1, INPUT_SIZE)) + anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)]) return anchor, positive, negative @@ -180,12 +178,14 @@ def __init__(self): self.conv = nn.Sequential( nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), nn.ReLU(), + nn.Dropout(0.3) ) # for size in conv_sizes # ] self.fc = nn.Sequential( nn.Linear(self.pool_size*self.channels, INPUT_SIZE), nn.ReLU(), + nn.Dropout(0.3), nn.Linear(INPUT_SIZE, OUTPUT_SIZE), nn.ReLU() ) @@ -202,6 +202,8 @@ def forward(self, x): # # configs + + class TripletLoss(nn.Module): def __init__(self, margin=1.0): super(TripletLoss, self).__init__() @@ -217,12 +219,13 @@ def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch. 
return losses.mean() + def init_weights(m): if isinstance(m, nn.Conv2d): torch.nn.init.xavier_normal_(m.weight) -data_loader = GCJ(X_train, y_train) +data_loader = GCJ(x_train_emb, y_train) embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") @@ -234,20 +237,19 @@ def init_weights(m): optimizer = optim.Adam(model.parameters(), lr=0.001) criterion = torch.jit.script(TripletLoss()) - +x_emb = x_emb[:X_test.shape[0]] callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768) -x_train_emb = torch.cat(x_train_emb) # training loop model.train() for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): running_loss = [] for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): anchor, positive, negative = data_loader.batch_generator(model, tree) - with torch.no_grad(): - anchor = embedding_model(torch.from_numpy(anchor)).last_hidden_state - positive = embedding_model(torch.from_numpy(positive)).last_hidden_state - negative = embedding_model(torch.from_numpy(negative)).last_hidden_state + # with torch.no_grad(): + # anchor = embedding_model(anchor).last_hidden_state + # positive = embedding_model(positive).last_hidden_state + # negative = embedding_model(negative).last_hidden_state optimizer.zero_grad() From 7021f0dd937c3e7d69e4a20a2759c2e601ac8e9f Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Sun, 13 Feb 2022 13:43:24 +0300 Subject: [PATCH 5/9] Classes are moved to separate files --- .gitignore | 3 + src/bert_attempts/BertBased.py | 225 +++-------------------------- src/bert_attempts/DataGenerator.py | 75 ++++++++++ src/bert_attempts/GCJ.py | 45 ++++++ src/bert_attempts/Network.py | 36 +++++ src/bert_attempts/README.md | 19 +++ src/bert_attempts/TripletLoss.py | 24 +++ 7 files changed, 222 insertions(+), 205 deletions(-) create mode 100644 src/bert_attempts/DataGenerator.py create mode 100644 src/bert_attempts/GCJ.py create mode 100644 src/bert_attempts/Network.py create mode 100644 src/bert_attempts/README.md create mode 100644 src/bert_attempts/TripletLoss.py diff --git a/.gitignore b/.gitignore index 67a7af0..ab03f38 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ myenv* embd/* embd *.csv +*.npy +*.pkl +*pt *.h5 *.png *.json diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index ff08f14..e1f3950 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -1,14 +1,18 @@ -from transformers import RobertaTokenizer, RobertaModel import tqdm -import numpy as np -import pandas as pd import torch + +import numpy as np import torch.nn as nn import torch.optim as optim -from AccuracyEvaluator import AccuracyEvaluator + from sklearn.neighbors import BallTree -from sklearn.preprocessing import LabelEncoder -# with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch + +from AccuracyEvaluator import AccuracyEvaluator +from GCJ import GCJ +from Network import Network +from TripletLoss import TripletLoss +from DataGenerator import generate_data + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -17,92 +21,13 @@ # -------------------------- constants df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv' -tmp_dataset_dir = "../../inputs/preprocessed_jsons/" -tmp_dataset_filename = tmp_dataset_dir + 'bert' + "_train.json" INPUT_SIZE = 512 # 514 tokens, maximum for bert OUTPUT_SIZE = 256 N_EPOCHS = 30 BATCH_SIZE = 16 -# -------------------------- load data - - -def generate_data(): - df = pd.read_csv(df_path) - # df = 
df.drop(columns=["round", "task", "solution", "file", - # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) - # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) - # df = df[(df.n_lines > 0)] - - - # def _insert_tokens(x: str): - # x = x.replace("\n", " NLN ") - # x = x.replace("\t", " TAB ") - # x = x.replace(" ", " SPC ") - # return x - # - # df.flines = df.flines.apply(_insert_tokens) - - # load tokenizer - tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") - df.index = np.arange(len(df)) - le = LabelEncoder() - df.user = le.fit_transform(df.user) - df['tokens'] = df.flines.apply(lambda x: - tokenizer.convert_tokens_to_ids( - tokenizer.tokenize(x))) - - dataset = df[["user", "tokens", "task"]] - # shuffle dataset - dataset = dataset.sample(frac=1) - - X = dataset.tokens.values - - def fillZeros(arr): - arr = np.array(arr) - if INPUT_SIZE > arr.shape[0]: - arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') - else: - arr = arr[:INPUT_SIZE] - return arr.reshape(INPUT_SIZE, 1).tolist() - - X = np.array([fillZeros(x) for x in X]) - X = X.reshape((-1, INPUT_SIZE)) - y = np.array(dataset.user) - tasks = np.array(dataset.task) - train_indexes = np.where(tasks < 7)[0] - test_indexes = np.where(tasks >= 7)[0] - X_train, X_test = X[train_indexes], X[test_indexes] - y_train, y_test = y[train_indexes], y[test_indexes] # 244 unique person - - embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") - - x_emb = [] - with torch.no_grad(): - for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): - xs = X_train[i: i+BATCH_SIZE] - new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state - x_emb = [*x_emb, *new_xs] - - np.save('x_train.np', X_train) - np.save('y_test.np', y_test) - np.save('y_train.np', y_train) - np.save('x_test.np', X_test) - torch.save(torch.cat(x_emb), 'test_tensor.pt') - print("main part saved") - x_train_emb = [] - with torch.no_grad(): - for i in tqdm.tqdm(range(0, X_train.shape[0], BATCH_SIZE)): - xs = X_train[i: i+BATCH_SIZE] - new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state - x_train_emb = [*x_train_emb, *new_xs] - # save x_emb, x_train, y_test, y_train - - torch.save(torch.cat(x_train_emb), 'train_tensor.pt') - -# generate_data() -print('restoring') +generate_data(df_path, INPUT_SIZE, OUTPUT_SIZE) X_train = np.load('x_train.np.npy') y_test = np.load('y_test.np.npy') @@ -110,146 +35,36 @@ def fillZeros(arr): X_test = np.load('x_test.np.npy') x_emb = torch.load('test_tensor.pt') x_train_emb = torch.load('train_tensor.pt') +# todo: remove reshaping (looks suspicious) x_emb = torch.reshape(x_emb, (-1, 512, 768)) x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768)) -# -------------------------- model architecture - -# let's do just a simple thing - -# 1. embedding from bert -> INPUT_SIZE * 768 -# 2. convolution (5*768) -# 3. fully-connected 500 -# 4. fully connected 100 - -# 1. 
pretrained part - -# train loader -class GCJ: - def __init__(self, X_train, y_train, batch_size = BATCH_SIZE): - self.x = X_train - self.y = y_train - self.batch_size = batch_size - - def batch_generator(self, model, tree): - n_positive = self.batch_size // 2 - anchor_index = np.random.choice(self.y.shape[0], 1) - y_anchor = self.y[anchor_index] - positive_indexes = np.where(self.y == y_anchor)[0] - n_same = positive_indexes.shape[0] - positive_indexes = positive_indexes[:n_positive] - k = self.batch_size - positive_indexes.shape[0] - - if tree is not None: - with torch.no_grad(): - query = model(self.x[anchor_index]) - query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] - negative_indexes = np.array([neighbour_index for neighbour_index in query_res - if self.y[neighbour_index] != y_anchor])[:k] - else: # the first batch generation - negative_indexes = np.where(self.y != y_anchor)[0] - np.random.shuffle(negative_indexes) - negative_indexes = negative_indexes[:k] - - local_x = self.x.reshape((-1, INPUT_SIZE, 768)) - - reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), - [positive_indexes, negative_indexes]) - - positive, negative = map(lambda i: local_x[i], reduced_indexes) - anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)]) - - return anchor, positive, negative - - def generator(self, model, tree): - while True: - yield self.batch_generator(model, tree) - - -# model - -class Network(nn.Module): - def __init__(self): - super(Network, self).__init__() - # conv_sizes = [2, 4, 16] - k_size = 8 - self.pool_size = INPUT_SIZE - k_size + 1 # output for conv - self.channels = 4 - self.conv = nn.Sequential( - nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), - nn.ReLU(), - nn.Dropout(0.3) - ) - # for size in conv_sizes - # ] - self.fc = nn.Sequential( - nn.Linear(self.pool_size*self.channels, INPUT_SIZE), - nn.ReLU(), - nn.Dropout(0.3), - nn.Linear(INPUT_SIZE, OUTPUT_SIZE), - nn.ReLU() - ) - - def forward(self, x): - # array = [conv(x) for conv in self.conv] - x = torch.reshape(x, (-1, 1, 512, 768)) - x = self.conv(x) - x = x.view(-1, self.channels*self.pool_size) - # x = torch.concat(array, dim=1) - x = self.fc(x) - return x - -# - -# configs - - -class TripletLoss(nn.Module): - def __init__(self, margin=1.0): - super(TripletLoss, self).__init__() - self.margin = margin - - def calc_euclidean(self, x1, x2): - return (x1 - x2).pow(2).sum(1) - - def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor: - distance_positive = self.calc_euclidean(anchor, positive) - distance_negative = self.calc_euclidean(anchor, negative) - losses = torch.relu(distance_positive - distance_negative + self.margin) - - return losses.mean() - def init_weights(m): if isinstance(m, nn.Conv2d): torch.nn.init.xavier_normal_(m.weight) -data_loader = GCJ(x_train_emb, y_train) - -embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") - -model = Network() +data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE) +model = Network(INPUT_SIZE, OUTPUT_SIZE) model.apply(init_weights) model = torch.jit.script(model).to(device) -tree = None # default value +tree = None # default value -optimizer = optim.Adam(model.parameters(), lr=0.001) +optimizer = optim.Adam(model.parameters(), lr=0.01) criterion = torch.jit.script(TripletLoss()) +# todo: check, why x_emb = x_emb[:X_test.shape[0]] callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768) # 
training loop model.train() +params = [] for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): running_loss = [] for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): anchor, positive, negative = data_loader.batch_generator(model, tree) - # with torch.no_grad(): - # anchor = embedding_model(anchor).last_hidden_state - # positive = embedding_model(positive).last_hidden_state - # negative = embedding_model(negative).last_hidden_state optimizer.zero_grad() @@ -266,11 +81,11 @@ def init_weights(m): tree = BallTree(predictions, metric="euclidean") current_loss = loss.cpu().detach().numpy() - print(current_loss) running_loss.append(current_loss) # callback (accuracy) metrics = callback.on_epoch_end(model, epoch, current_loss) print(metrics) + params.append(metrics) - print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss))) \ No newline at end of file + print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss))) diff --git a/src/bert_attempts/DataGenerator.py b/src/bert_attempts/DataGenerator.py new file mode 100644 index 0000000..1d57d67 --- /dev/null +++ b/src/bert_attempts/DataGenerator.py @@ -0,0 +1,75 @@ +import torch +import tqdm + +import pandas as pd +import numpy as np + +from sklearn.preprocessing import LabelEncoder +from transformers import RobertaTokenizer, RobertaModel + + +def generate_data(df_path, INPUT_SIZE, BATCH_SIZE): + df = pd.read_csv(df_path) + # df = df.drop(columns=["round", "task", "solution", "file", + # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) + # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) + # df = df[(df.n_lines > 0)] + + # def _insert_tokens(x: str): + # x = x.replace("\n", " NLN ") + # x = x.replace("\t", " TAB ") + # x = x.replace(" ", " SPC ") + # return x + # + # df.flines = df.flines.apply(_insert_tokens) + + # load tokenizer + tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") + df.index = np.arange(len(df)) + le = LabelEncoder() + df.user = le.fit_transform(df.user) + df['tokens'] = df.flines.apply(lambda x: tokenizer + .convert_tokens_to_ids(tokenizer.tokenize(x))) + + dataset = df[["user", "tokens", "task"]] + # shuffle dataset + dataset = dataset.sample(frac=1) + + X = dataset.tokens.values + + def fill_zeros(arr): + arr = np.array(arr) + if INPUT_SIZE > arr.shape[0]: + arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') + else: + arr = arr[:INPUT_SIZE] + return arr.reshape(INPUT_SIZE, 1).tolist() + + X = np.array([fill_zeros(x) for x in X]) + X = X.reshape((-1, INPUT_SIZE)) + y = np.array(dataset.user) + tasks = np.array(dataset.task) + train_indexes = np.where(tasks < 7)[0] + test_indexes = np.where(tasks >= 7)[0] + X_train, X_test = X[train_indexes], X[test_indexes] + y_train, y_test = y[train_indexes], y[test_indexes] + + embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") + + def get_embedding(data): + emb = [] + with torch.no_grad(): + for i in tqdm.tqdm(range(0, data.shape[0], BATCH_SIZE)): + batch = data[i: i+BATCH_SIZE] + new_part = embedding_model(torch.from_numpy(batch)).last_hidden_state + emb = [*emb, *new_part] + return emb + + x_emb = get_embedding(X_test) + np.save('x_train.np', X_train) + np.save('y_test.np', y_test) + np.save('y_train.np', y_train) + np.save('x_test.np', X_test) + torch.save(torch.cat(x_emb), 'test_tensor.pt') + x_train_emb = get_embedding(X_train) + torch.save(torch.cat(x_train_emb), 'train_tensor.pt') diff --git 
a/src/bert_attempts/GCJ.py b/src/bert_attempts/GCJ.py new file mode 100644 index 0000000..b6c3de0 --- /dev/null +++ b/src/bert_attempts/GCJ.py @@ -0,0 +1,45 @@ +import torch +import numpy as np + +''' +The loader of the train data (batch generator) +''' + + +class GCJ: + + def __init__(self, X_train, y_train, batch_size, input_size): + self.x = X_train + self.y = y_train + self.batch_size = batch_size + self.input_size = input_size + + def batch_generator(self, model, tree): + n_positive = self.batch_size // 2 + anchor_index = np.random.choice(self.y.shape[0], 1) + y_anchor = self.y[anchor_index] + positive_indexes = np.where(self.y == y_anchor)[0] + n_same = positive_indexes.shape[0] + positive_indexes = positive_indexes[:n_positive] + k = self.batch_size - positive_indexes.shape[0] + + if tree is not None: + with torch.no_grad(): + query = model(self.x[anchor_index]) + query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] + negative_indexes = np.array([neighbour_index for neighbour_index in query_res + if self.y[neighbour_index] != y_anchor])[:k] + else: # the first batch generation + negative_indexes = np.where(self.y != y_anchor)[0] + np.random.shuffle(negative_indexes) + negative_indexes = negative_indexes[:k] + + local_x = self.x.reshape((-1, self.input_size, 768)) + + reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), + [positive_indexes, negative_indexes]) + + positive, negative = map(lambda i: local_x[i], reduced_indexes) + anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)]) + + return anchor, positive, negative diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py new file mode 100644 index 0000000..2656e75 --- /dev/null +++ b/src/bert_attempts/Network.py @@ -0,0 +1,36 @@ +import torch +from torch import nn + + +class Network(nn.Module): + def __init__(self, input_size, output_size): + super(Network, self).__init__() + + self.input_size = input_size + self.output_size = output_size + # conv_sizes = [2, 4, 16] + k_size = 8 + self.pool_size = self.input_size - k_size + 1 # output for conv + self.channels = 4 + self.conv = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), + nn.ReLU(), + nn.Dropout(0.3) + ) + # for size in conv_sizes + self.fc = nn.Sequential( + nn.Linear(self.pool_size*self.channels, self.input_size), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(self.input_size, self.output_size), + nn.ReLU() + ) + + def forward(self, x): + # array = [conv(x) for conv in self.conv] + x = torch.reshape(x, (-1, 1, self.input_size, 768)) + x = self.conv(x) + x = x.view(-1, self.channels*self.pool_size) + # x = torch.concat(array, dim=1) + x = self.fc(x) + return x diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md new file mode 100644 index 0000000..066dbb8 --- /dev/null +++ b/src/bert_attempts/README.md @@ -0,0 +1,19 @@ +# Bert - like attempts + +## Current tasks +- [ ] add model saving +- [ ] add loss savings +- [ ] limit the number of tests evaluations => speed-up the training +- [x] split the file on separate classes +- [ ] change model to one in `Embedding.py` (regularization, parallel evaluation) +- [ ] debug + +## Execution: +1. Start file - `BertBased.py` +2. 
The line with data generation:
+
+```python
+generate_data(df_path, INPUT_SIZE, OUTPUT_SIZE)
+```
+
+Comment this call out if the data has already been generated (it executes on the CPU and takes ~1.5h)
\ No newline at end of file
diff --git a/src/bert_attempts/TripletLoss.py b/src/bert_attempts/TripletLoss.py
new file mode 100644
index 0000000..875959a
--- /dev/null
+++ b/src/bert_attempts/TripletLoss.py
@@ -0,0 +1,24 @@
+import torch
+from torch import nn
+
+'''
+with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch
+'''
+
+
+class TripletLoss(nn.Module):
+
+    def __init__(self, margin=0.1):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+
+    @staticmethod
+    def calc_euclidean(x1, x2):
+        return (x1 - x2).pow(2).sum(1)
+
+    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
+        distance_positive = self.calc_euclidean(anchor, positive)
+        distance_negative = self.calc_euclidean(anchor, negative)
+        # note: relu(x) = max(x, 0), so this is the standard triplet hinge max(0, d_pos - d_neg + margin)
+        losses = torch.relu(distance_positive - distance_negative + self.margin)
+        return losses.mean()
From 35d5899fd97903578527393a79e774f8d449a1bd Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Sun, 13 Feb 2022 14:03:45 +0300
Subject: [PATCH 6/9] Model and data saving added, number of test evaluations reduced

---
 src/bert_attempts/BertBased.py | 26 ++++++++++++++----------
 src/bert_attempts/README.md    |  9 +++++----
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index e1f3950..e90f219 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -1,5 +1,6 @@
 import tqdm
 import torch
+import pickle
 
 import numpy as np
 import torch.nn as nn
@@ -63,7 +64,7 @@ def init_weights(m):
 params = []
 for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"):
     running_loss = []
-    for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)):
+    for step in tqdm.tqdm(range(np.unique(y_train).shape[0]), desc="Training", leave=False):
         anchor, positive, negative = data_loader.batch_generator(model, tree)
 
         optimizer.zero_grad()
@@ -76,16 +77,21 @@ def init_weights(m):
         loss.backward()
         optimizer.step()
 
-        with torch.no_grad():
-            predictions = model(x_train_emb)
-        tree = BallTree(predictions, metric="euclidean")
+        if (step % 10 == 0):
+            with torch.no_grad():
+                predictions = model(x_train_emb)
+            tree = BallTree(predictions, metric="euclidean")
 
-        current_loss = loss.cpu().detach().numpy()
-        running_loss.append(current_loss)
+            current_loss = loss.cpu().detach().numpy()
+            running_loss.append(current_loss)
 
-        # callback (accuracy)
-        metrics = callback.on_epoch_end(model, epoch, current_loss)
-        print(metrics)
-        params.append(metrics)
+            # callback (accuracy)
+            metrics = callback.on_epoch_end(model, epoch, current_loss)
+            print(metrics)
+            params.append(metrics)
+            with open('training.pkl', 'wb'):
+                pickle.dump(params)
+
+    torch.save(model.state_dict(), 'model')
 
     print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md
index 066dbb8..7a54f44 100644
--- a/src/bert_attempts/README.md
+++ b/src/bert_attempts/README.md
@@ -1,12 +1,13 @@
 # Bert - like attempts
 
 ## Current tasks
-- [ ] add model saving
-- [ ] add loss savings
-- [ ] limit the number of tests evaluations => speed-up the training
+- [x] ! add model saving
+- [x] ! add loss savings
+- [x] ! limit the number of tests evaluations => speed-up the training
 - [x] split the file on separate classes
-- [ ] change model to one in `Embedding.py` (regularization, parallel evaluation)
+- [ ] !! change model to one in `Embedding.py` (regularization, parallel evaluation)
 - [ ] debug
+- [ ] separate data from code
 
 ## Execution:
 1. Start file - `BertBased.py`
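The `BallTree` rebuilt every 10th step in the patch above powers the hard-negative mining in `GCJ.batch_generator`: the anchor is embedded, its nearest neighbours in the current embedding space are retrieved, and the close samples with a different author label become the batch's negatives. A condensed restatement of that selection logic, using the names from `GCJ.py` (`model`, `tree`, and the index variables come from the surrounding code):

```python
import numpy as np
import torch

with torch.no_grad():
    query = model(self.x[anchor_index])          # embed the anchor
# nearest neighbours of the anchor under the current model
neighbours = tree.query(query, self.batch_size + n_same,
                        return_distance=False)[0]
# close samples by OTHER authors = the hardest negatives right now
negative_indexes = np.array([i for i in neighbours
                             if self.y[i] != y_anchor])[:k]
```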
From 49cdedb3b05cec15b94bdefcbff7ea83a83fefbc Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Sun, 13 Feb 2022 15:26:59 +0300
Subject: [PATCH 7/9] Added paths, network arch changed

---
 src/bert_attempts/BertBased.py     | 46 ++++++++++++++++++++----------
 src/bert_attempts/DataGenerator.py | 14 ++++-----
 src/bert_attempts/Network.py       | 31 +++++++++++++-------
 src/bert_attempts/README.md        |  6 ++--
 4 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index e90f219..f3906df 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -1,3 +1,4 @@
+import os
 import tqdm
 import torch
 import pickle
@@ -22,23 +23,21 @@
 # -------------------------- constants
 df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
+DATA_PATH = './data/'
+TRAIN_PATH = './train/'
 
 INPUT_SIZE = 512  # 514 tokens, maximum for bert
 OUTPUT_SIZE = 256
 N_EPOCHS = 30
 BATCH_SIZE = 16
 
-generate_data(df_path, INPUT_SIZE, OUTPUT_SIZE)
 
-X_train = np.load('x_train.np.npy')
-y_test = np.load('y_test.np.npy')
-y_train = np.load('y_train.np.npy')
-X_test = np.load('x_test.np.npy')
-x_emb = torch.load('test_tensor.pt')
-x_train_emb = torch.load('train_tensor.pt')
-# todo: remove reshaping (looks suspicious)
-x_emb = torch.reshape(x_emb, (-1, 512, 768))
-x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768))
+def mkdir(dir_name):
+    # create dirs
+    try:
+        os.makedirs(dir_name)
+    except FileExistsError:
+        print('Dir exists')
 
 
 def init_weights(m):
@@ -46,11 +45,28 @@ def init_weights(m):
         torch.nn.init.xavier_normal_(m.weight)
 
 
-data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)
+mkdir(DATA_PATH)
+mkdir(TRAIN_PATH)
+
 model = Network(INPUT_SIZE, OUTPUT_SIZE)
 model.apply(init_weights)
 model = torch.jit.script(model).to(device)
 
+generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)
+
+X_train = np.load(DATA_PATH + 'x_train.np.npy')
+y_test = np.load(DATA_PATH + 'y_test.np.npy')
+y_train = np.load(DATA_PATH + 'y_train.np.npy')
+X_test = np.load(DATA_PATH + 'x_test.np.npy')
+x_emb = torch.load(DATA_PATH + 'test_tensor.pt')
+x_train_emb = torch.load(DATA_PATH + 'train_tensor.pt')
+x_emb = torch.reshape(x_emb, (-1, 512, 768))
+x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768))
+
+
+data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)
+
+
 tree = None  # default value
 
 optimizer = optim.Adam(model.parameters(), lr=0.01)
 criterion = torch.jit.script(TripletLoss())
@@ -77,7 +93,7 @@ def init_weights(m):
         loss.backward()
         optimizer.step()
 
-        if (step % 10 == 0):
+        if step % 10 == 0:
             with torch.no_grad():
                 predictions = model(x_train_emb)
             tree = BallTree(predictions, metric="euclidean")
@@ -89,9 +105,9 @@ def init_weights(m):
             metrics = callback.on_epoch_end(model, epoch, current_loss)
             print(metrics)
             params.append(metrics)
-            with open('training.pkl', 'wb'):
-                pickle.dump(params)
+            with open(TRAIN_PATH + 'training.pkl', 'wb') as f:
+                pickle.dump(params, f)
 
-    torch.save(model.state_dict(), 'model')
+    torch.save(model.state_dict(), TRAIN_PATH + 'model')
 
     print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
diff --git a/src/bert_attempts/DataGenerator.py 
b/src/bert_attempts/DataGenerator.py index 1d57d67..cba9d54 100644 --- a/src/bert_attempts/DataGenerator.py +++ b/src/bert_attempts/DataGenerator.py @@ -8,7 +8,7 @@ from transformers import RobertaTokenizer, RobertaModel -def generate_data(df_path, INPUT_SIZE, BATCH_SIZE): +def generate_data(df_path: str, data_path: str, INPUT_SIZE: int, BATCH_SIZE: int): df = pd.read_csv(df_path) # df = df.drop(columns=["round", "task", "solution", "file", # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) @@ -66,10 +66,10 @@ def get_embedding(data): return emb x_emb = get_embedding(X_test) - np.save('x_train.np', X_train) - np.save('y_test.np', y_test) - np.save('y_train.np', y_train) - np.save('x_test.np', X_test) - torch.save(torch.cat(x_emb), 'test_tensor.pt') + np.save(data_path + 'x_train.np', X_train) + np.save(data_path + 'y_test.np', y_test) + np.save(data_path + 'y_train.np', y_train) + np.save(data_path + 'x_test.np', X_test) + torch.save(torch.cat(x_emb), data_path + 'test_tensor.pt') x_train_emb = get_embedding(X_train) - torch.save(torch.cat(x_train_emb), 'train_tensor.pt') + torch.save(torch.cat(x_train_emb), data_path + 'train_tensor.pt') diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py index 2656e75..f94c35a 100644 --- a/src/bert_attempts/Network.py +++ b/src/bert_attempts/Network.py @@ -8,17 +8,24 @@ def __init__(self, input_size, output_size): self.input_size = input_size self.output_size = output_size - # conv_sizes = [2, 4, 16] - k_size = 8 - self.pool_size = self.input_size - k_size + 1 # output for conv - self.channels = 4 - self.conv = nn.Sequential( - nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), - nn.ReLU(), - nn.Dropout(0.3) - ) - # for size in conv_sizes + self.conv_sizes = [2, 4, 16] + self.pool_size = sum([self.input_size - size + 1 for size in self.conv_sizes]) # output for conv + self.channels = 2 + self.conv1 = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(2, 768),), + nn.ReLU(), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(4, 768),), + nn.ReLU(), + ) + self.conv3 = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(16, 768),), + nn.ReLU(), + ) + self.fc = nn.Sequential( + nn.Dropout(0.5), nn.Linear(self.pool_size*self.channels, self.input_size), nn.ReLU(), nn.Dropout(0.3), @@ -29,7 +36,9 @@ def __init__(self, input_size, output_size): def forward(self, x): # array = [conv(x) for conv in self.conv] x = torch.reshape(x, (-1, 1, self.input_size, 768)) - x = self.conv(x) + + # torch.view(-1, self.channels * self.input_size - size + 1) + x = torch.cat([self.conv1(x), self.conv2(x), self.conv3(x)]) x = x.view(-1, self.channels*self.pool_size) # x = torch.concat(array, dim=1) x = self.fc(x) diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md index 7a54f44..d86a573 100644 --- a/src/bert_attempts/README.md +++ b/src/bert_attempts/README.md @@ -5,9 +5,9 @@ - [x] ! add loss savings - [x] ! limit the number of tests evaluations => speed-up the training - [x] split the file on separate classes -- [ ] !! change model to one in `Embedding.py` (regularization, parallel evaluation) -- [ ] debug -- [ ] separate data from code +- [x] !! change model to one in `Embedding.py` (regularization, parallel evaluation) +- [x] debug +- [x] separate data from code ## Execution: 1. 
From bf68ec2ddd46e5bb4a75dca1f79ba9117872ba3f Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Wed, 16 Feb 2022 00:06:26 +0300
Subject: [PATCH 8/9] Two unsuccessful training runs

---
 src/bert_attempts/BertBased.py |  4 ++--
 src/bert_attempts/Network.py   | 20 +++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index f3906df..d2b5252 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -52,7 +52,7 @@ def init_weights(m):
 model.apply(init_weights)
 model = torch.jit.script(model).to(device)

-generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)
+# generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)

 X_train = np.load(DATA_PATH + 'x_train.np.npy')
 y_test = np.load(DATA_PATH + 'y_test.np.npy')
@@ -69,7 +69,7 @@ def init_weights(m):

 tree = None  # default value
-optimizer = optim.Adam(model.parameters(), lr=0.01)
+optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.05)
 criterion = torch.jit.script(TripletLoss())
 # todo: check, why
 x_emb = x_emb[:X_test.shape[0]]
diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py
index f94c35a..832e2ec 100644
--- a/src/bert_attempts/Network.py
+++ b/src/bert_attempts/Network.py
@@ -9,7 +9,7 @@ def __init__(self, input_size, output_size):
         self.input_size = input_size
         self.output_size = output_size
         self.conv_sizes = [2, 4, 16]
-        self.pool_size = sum([self.input_size - size + 1 for size in self.conv_sizes])  # output for conv
+        self.pool_size = [self.input_size - size + 1 for size in self.conv_sizes]  # outputs for convs
         self.channels = 2
         self.conv1 = nn.Sequential(
             nn.Conv2d(1, self.channels, kernel_size=(2, 768),),
@@ -25,10 +25,12 @@
         )

         self.fc = nn.Sequential(
+            nn.LayerNorm(sum(self.pool_size)*self.channels),
             nn.Dropout(0.5),
-            nn.Linear(self.pool_size*self.channels, self.input_size),
+            nn.Linear(sum(self.pool_size)*self.channels, self.input_size),
             nn.ReLU(),
-            nn.Dropout(0.3),
+            nn.LayerNorm(self.input_size),
+            nn.Dropout(0.5),
             nn.Linear(self.input_size, self.output_size),
             nn.ReLU()
         )
@@ -38,8 +40,16 @@ def forward(self, x):
         x = torch.reshape(x, (-1, 1, self.input_size, 768))

         # torch.view(-1, self.channels * self.input_size - size + 1)
-        x = torch.cat([self.conv1(x), self.conv2(x), self.conv3(x)])
-        x = x.view(-1, self.channels*self.pool_size)
+        x1 = self.conv1(x)
+        x2 = self.conv2(x)
+        x3 = self.conv3(x)
+
+        x1 = x1.view(-1, self.channels*self.pool_size[0])
+        x2 = x2.view(-1, self.channels*self.pool_size[1])
+        x3 = x3.view(-1, self.channels*self.pool_size[2])
+
+        x = torch.cat([x1, x2, x3], -1)
+        x = x.view(-1, self.channels*sum(self.pool_size))
         # x = torch.concat(array, dim=1)
         x = self.fc(x)
         return x
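`TripletLoss` is imported from elsewhere in the repository and never shown in this series. Assuming the usual `(anchor, positive, negative)` calling convention from the Kaggle notebook referenced in `BertBased.py`, a minimal scriptable sketch could look like:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TripletLoss(nn.Module):
        # hinge-style triplet loss: keep d(anchor, positive) below
        # d(anchor, negative) by at least `margin`
        def __init__(self, margin: float = 1.0):
            super().__init__()
            self.margin = margin

        def forward(self, anchor: torch.Tensor, positive: torch.Tensor,
                    negative: torch.Tensor) -> torch.Tensor:
            d_pos = F.pairwise_distance(anchor, positive)
            d_neg = F.pairwise_distance(anchor, negative)
            return torch.relu(d_pos - d_neg + self.margin).mean()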
From e43f6b7aaa9306078802d11e290680c8788ff814 Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Wed, 27 Apr 2022 22:10:06 +0300
Subject: [PATCH 9/9] Configuration

---
 src/bert_attempts/BertBased.py                 | 8 ++++----
 src/main.py                                    | 6 +++---
 src/models/Embedding.py                        | 9 +++++----
 src/models/data_processing/TokenFeatures.py    | 9 ++++-----
 src/models/data_processing/base/DataLoading.py | 2 +-
 src/visualization/base/Visualizer.py           | 2 +-
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index d2b5252..4de4ad9 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -69,7 +69,7 @@ def init_weights(m):

 tree = None  # default value
-optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.05)
+optimizer = optim.Adam(model.parameters(), lr=10**(-6), weight_decay=0.05)
 criterion = torch.jit.script(TripletLoss())
 # todo: check, why
 x_emb = x_emb[:X_test.shape[0]]
@@ -94,9 +94,9 @@ def init_weights(m):
             optimizer.step()

             if step % 10 == 0:
-                with torch.no_grad():
-                    predictions = model(x_train_emb)
-                tree = BallTree(predictions, metric="euclidean")
+                # with torch.no_grad():
+                #     predictions = model(x_train_emb)
+                # tree = BallTree(predictions, metric="euclidean")
                 current_loss = loss.cpu().detach().numpy()
                 running_loss.append(current_loss)
diff --git a/src/main.py b/src/main.py
index 5dfd8d8..aea8cf4 100644
--- a/src/main.py
+++ b/src/main.py
@@ -2,7 +2,7 @@
 # from training.AvgTriplet import AverageTriplet
 from models.Embedding import Embedding
 # from models.Conv2D import Conv2D
-from visualization.VisualizerTokenFeatures import VisualizerTokenFeatures
+# from visualization.VisualizerTokenFeatures import VisualizerTokenFeatures
 # from visualization.VisualizerCharFeatures import VisualizerCharFeatures

 import tensorflow as tf
@@ -11,8 +11,8 @@
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu, True)

-model = Embedding(input_size=800, crop=200, output_size=50, make_initial_preprocess=True)
-SingleTriplet(model=model).train(batch_size=16, epochs=40, epoch_start=0, step_start=0)
+model = Embedding(input_size=800, crop=800, output_size=50, make_initial_preprocess=True)
+SingleTriplet(model=model).train(batch_size=16, epochs=50, epoch_start=0, step_start=0)

 # VisualizerTokenFeatures().run()
 # model = Conv2D()
diff --git a/src/models/Embedding.py b/src/models/Embedding.py
index b1a4643..dcde9de 100644
--- a/src/models/Embedding.py
+++ b/src/models/Embedding.py
@@ -28,9 +28,10 @@ def create_after_emb(self, reshape1,
                          conv_channels=2,
                          emb_height=100,
                          activation="relu",
-                         L2_lambda=0.02,
-                         conv_sizes=[2, 4, 16]):
+                         L2_lambda=0.05,
+                         conv_sizes=[2, 4, 8, 16]):
         # parallel piece
+        # d1 = layers.Dropout(0.3)(reshape1)
         convolutions = [layers.Conv2D(conv_channels, (conv_size, emb_height),
                                       name="conv2d_size_{}".format(conv_size),
                                       padding="same", activation=activation,
@@ -60,8 +61,8 @@ def create_after_emb(self, reshape1,

     def create_model(self,
                      activation: str = "relu",
-                     L2_lambda: float = 0.02,
-                     conv_sizes: List[int] = [2, 4, 16],
+                     L2_lambda: float = 0.05,
+                     conv_sizes: List[int] = [2, 4, 8, 16],
                      emb_height: int = 100):

         conv_channels = 2
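This commit disables the periodic BallTree refresh in the training loop above, so triplet mining proceeds without an up-to-date neighbour index. For context, a sketch of how such a tree is typically queried for hard negatives; `embeddings` and `labels` here are hypothetical stand-ins for the model outputs and the author ids:

    import numpy as np
    from sklearn.neighbors import BallTree

    # embeddings: (n_samples, emb_dim) array; labels: (n_samples,) author ids
    tree = BallTree(embeddings, metric="euclidean")
    _, idx = tree.query(embeddings, k=10)  # 10 nearest neighbours per sample
    # skip idx[:, 0] (each sample is its own nearest neighbour); a hard
    # negative is the closest neighbour written by a different author
    hard_neg = [next((j for j in row[1:] if labels[j] != labels[i]), row[-1])
                for i, row in enumerate(idx)]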
diff --git a/src/models/data_processing/TokenFeatures.py b/src/models/data_processing/TokenFeatures.py
index 5b85a31..0113c2c 100644
--- a/src/models/data_processing/TokenFeatures.py
+++ b/src/models/data_processing/TokenFeatures.py
@@ -29,7 +29,7 @@ def __init__(self,

     @staticmethod
     def _write_vocab_file(filepath: str, vocab: List[str]):
-        with open(filepath, "w") as f:
+        with open(filepath, "w", encoding="utf-8") as f:
             for token in vocab:
                 print(token, file=f)

@@ -38,7 +38,7 @@
         x = x.replace("\n", " NLN ")
         x = x.replace("\t", " TAB ")
         x = x.replace(" ", " SPC ")
-        return x
+        return x.encode("utf-8")

     def initial_preprocess(self, df_path: str, tmp_dataset_filename: str):
         df = self._initial_load(df_path)
@@ -61,11 +61,11 @@
         # reduce the size of the dataset according to the n_tokens
         df.index = np.arange(len(df))
         df["n_tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).shape[0])
-        df = df[df.n_tokens <= self.input_size]
+        # df = df[df.n_tokens <= self.input_size]
         # reindex
         df.index = np.arange(len(df))
         # reduce size
-        df = self._user_selection_and_encoding(df, 50, 450)
+        df = self._user_selection_and_encoding(df, 0, 400)
         # long saving
         # The issue is that `tokenizer.tokenize()` does not always return a shape (-1, 1).
         # Some elements of the result of the function could be a list, e.g. [[2929, 8524]].
@@ -104,7 +104,6 @@ def secondary_preprocess(self, tmp_dataset_filename: str):
         test_indexes = np.where(tasks >= 7)[0]
         X_train, X_test = X[train_indexes], X[test_indexes]
         y_train, y_test = y[train_indexes], y[test_indexes]
-
         # X_train, y_train = self._crop_to(X_train, y_train, rs1=(-1, self.crop), rs2=(-1, self.crop, 1))
         # X_test, y_test = self._crop_to(X_test, y_test, rs1=(-1, self.crop), rs2=(-1, self.crop, 1))
         # self.input_size = self.crop
diff --git a/src/models/data_processing/base/DataLoading.py b/src/models/data_processing/base/DataLoading.py
index f9d574f..a5a59f0 100644
--- a/src/models/data_processing/base/DataLoading.py
+++ b/src/models/data_processing/base/DataLoading.py
@@ -87,7 +87,7 @@ def _crop_to(self,
         return new_X, new_y

     def preprocess(self,
-                   df_path: str = "../inputs/processed_dfs/cpp_9_tasks_2016.csv",
+                   df_path: str = "../inputs/processed_dfs/valid_py_9_tasks_2020.csv",
                    tmp_dataset_dir: str = "../inputs/preprocessed_jsons/") -> Tuple[np.ndarray, np.ndarray,
                                                                                     np.ndarray, np.ndarray]:
         """
diff --git a/src/visualization/base/Visualizer.py b/src/visualization/base/Visualizer.py
index 750b466..5833267 100644
--- a/src/visualization/base/Visualizer.py
+++ b/src/visualization/base/Visualizer.py
@@ -23,7 +23,7 @@ def __init__(self,
         self.model_name = model_name
         self.snippet_index = snippet_index

-        self.model = tf.keras.models.load_model('../outputs/{}_0.h'.format(model_name))
+        self.model = tf.keras.models.load_model('../outputs/{}_49.h'.format(model_name))
         all_x, _, all_y, _ = data_loader.secondary_preprocess("../inputs/preprocessed_jsons/{}_train.json"
                                                               .format(model_name))
         self.triplet_type = AverageTriplet(self.model)
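A closing note on the checkpoint path: `main.py` now trains for 50 epochs, and the checkpoints appear to be saved once per epoch under zero-based names, which is why the visualizer loads `'{}_49.h'`. If the schedule changes again, a sketch that picks the newest checkpoint instead of a hard-coded index, assuming the same `<name>_<epoch>.h` naming, could be:

    import glob
    import re
    import tensorflow as tf

    paths = glob.glob('../outputs/{}_*.h'.format(model_name))
    # sort by the trailing epoch number and load the most recent checkpoint
    latest = max(paths, key=lambda p: int(re.search(r'_(\d+)\.h$', p).group(1)))
    model = tf.keras.models.load_model(latest)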