diff --git a/.gitignore b/.gitignore
index 67a7af0..ab03f38 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@ myenv*
 embd/*
 embd
 *.csv
+*.npy
+*.pkl
+*.pt
 *.h5
 *.png
 *.json
diff --git a/requirements.txt b/requirements.txt
index f5315eb..cb6c546 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,6 @@ tqdm==4.58.0
 sentencepiece==0.1.95
 pydot==1.4.2
 tensorflow-text==2.5.0
+torch==1.10.0
+torchvision==0.11.1
+transformers==4.15.0
diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py
new file mode 100644
index 0000000..e1ab84c
--- /dev/null
+++ b/src/bert_attempts/AccuracyEvaluator.py
@@ -0,0 +1,129 @@
+import io
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+
+from typing import List
+from sklearn.manifold import TSNE
+from sklearn.metrics import accuracy_score
+from sklearn.neighbors import KNeighborsClassifier
+
+
+class AccuracyEvaluator:
+
+    def __init__(self,
+                 X_train,
+                 X_test,
+                 y_train: np.ndarray,
+                 y_test: np.ndarray,
+                 threshold: float = 0.1,
+                 input_size: int = 500,
+                 authors: List = list(range(20))):
+        """
+        Parameters:
+        - `X_train`, `X_test` - arrays with data (tokens)
+        - `y_train`, `y_test` - np.ndarrays with labels (numerical representations of authors)
+
+        - `threshold` - alpha parameter of the triplet loss, distance threshold for classification
+        - `input_size` - number of tokens in one file
+        - `authors` - list of author ids; the prediction stage requires an all-with-all
+          comparison (O(n^2)), which is why the set of authors is reduced
+          for plotting and evaluation
+        """
+        self.threshold = threshold
+        self.input_size = input_size
+        self.authors = authors
+
+        # x-y preprocessing: keep only the selected authors for the "simple" subset
+        def select_authors(initial_x, initial_y):
+            index = np.where(np.isin(initial_y, self.authors))[0]
+            return initial_x[index], initial_y[index]
+
+        simple_x_train, simple_y_train = select_authors(X_train, y_train)
+        simple_x_test, simple_y_test = select_authors(X_test, y_test)
+
+        self.data = {
+            "simple": {
+                "train": [simple_x_train, simple_y_train],
+                "test": [simple_x_test, simple_y_test]
+            },
+            "full": {
+                "train": [X_train, y_train],
+                "test": [X_test, y_test]
+            }
+        }
+
+        # step counter initialization
+        self.n = 0
+
+    @staticmethod
+    def _plot_to_image(figure):
+        # https://www.tensorflow.org/tensorboard/image_summaries
+        buf = io.BytesIO()
+        plt.savefig(buf, format="png")
+        plt.close(figure)
+        buf.seek(0)
+        return buf
+
+    def apply_dimensionality_reduction(self,
+                                       transformed_x,
+                                       y: np.ndarray,
+                                       epoch: int,
+                                       is_test: bool):
+        x_tsne = TSNE(n_components=2).fit_transform(transformed_x)
+        figure = plt.figure(figsize=(10, 8))
+        plt.title("Step {} (epoch {})".format(self.n, epoch))
+        for developer in self.authors:
+            indexes = np.where(y == developer)[0]
+            plt.plot(x_tsne[indexes, 0], x_tsne[indexes, 1], "o", ms=5)
+        # save as a file
+        plt.savefig("outputs/tsne_{}.png".format(self.n))
+        # log to tensorboard
+        # image = self._plot_to_image(figure)
+        # writer = self.test_summary_writer if is_test else self.train_summary_writer
+        # with writer.as_default():
+        #     tf.summary.image("Distribution of authors", image, step=self.n)
+
+        plt.close("all")
+
+    def get_acc(self,
+                model,
+                x,
+                y: np.ndarray,
+                epoch: int,
+                is_test: bool,
+                dim_red: bool = True) -> float:
+        with torch.no_grad():
+            model.eval()  # disable dropout while evaluating
+            transformed_x = model(x).cpu().numpy()
+            model.train()
+        knn = KNeighborsClassifier().fit(transformed_x, y)
+        predictions = knn.predict(transformed_x)
+        accuracy = accuracy_score(y_true=y, y_pred=predictions)
+        if dim_red:
+            self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test)
+        return accuracy
+
+    def _writer(self,
+                x,
+                y,
+                model,
+                epoch: int,
+                is_test: bool,
+                is_simple: bool) -> float:
+        accuracy = self.get_acc(model, x, y, epoch, is_test, dim_red=is_simple)
+        return accuracy
+
+    def on_epoch_end(self,
+                     model,
+                     epoch: int,
+                     loss: float):
+        aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True)
+        astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True)
+        afte = self._writer(*self.data["full"]["test"], model, epoch, True, False)
+
+        print(loss, astr, aste, afte)
+        self.n += 1
+        return astr, aste, afte
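+
+# Usage sketch (illustrative, mirroring the call sites in BertBased.py):
+#   callback = AccuracyEvaluator(x_train_emb, x_test_emb, y_train, y_test, input_size=768)
+#   astr, aste, afte = callback.on_epoch_end(model, epoch, loss)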
diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
new file mode 100644
index 0000000..4de4ad9
--- /dev/null
+++ b/src/bert_attempts/BertBased.py
@@ -0,0 +1,113 @@
+import os
+import tqdm
+import torch
+import pickle
+
+import numpy as np
+import torch.nn as nn
+import torch.optim as optim
+
+from sklearn.neighbors import BallTree
+
+from AccuracyEvaluator import AccuracyEvaluator
+from GCJ import GCJ
+from Network import Network
+from TripletLoss import TripletLoss
+from DataGenerator import generate_data
+
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+if device.type == "cuda":
+    print(torch.cuda.get_device_name())
+
+# -------------------------- constants
+df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
+DATA_PATH = './data/'
+TRAIN_PATH = './train/'
+
+INPUT_SIZE = 512  # 512 tokens, the maximum sequence length for BERT-like models
+OUTPUT_SIZE = 256
+N_EPOCHS = 30
+BATCH_SIZE = 16
+
+
+def mkdir(dir_name):
+    try:
+        os.makedirs(dir_name)
+    except FileExistsError:
+        print('Directory already exists')
+
+
+def init_weights(m):
+    if isinstance(m, nn.Conv2d):
+        torch.nn.init.xavier_normal_(m.weight)
+
+
+mkdir(DATA_PATH)
+mkdir(TRAIN_PATH)
+
+model = Network(INPUT_SIZE, OUTPUT_SIZE)
+model.apply(init_weights)
+model = torch.jit.script(model).to(device)
+
+# uncomment for the first run; see the README for the expected runtime
+# generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)
+
+X_train = np.load(DATA_PATH + 'x_train.np.npy')
+y_test = np.load(DATA_PATH + 'y_test.np.npy')
+y_train = np.load(DATA_PATH + 'y_train.np.npy')
+X_test = np.load(DATA_PATH + 'x_test.np.npy')
+x_emb = torch.load(DATA_PATH + 'test_tensor.pt')
+x_train_emb = torch.load(DATA_PATH + 'train_tensor.pt')
+x_emb = torch.reshape(x_emb, (-1, 512, 768)).to(device)
+x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768)).to(device)
+
+
+data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)
+
+
+tree = None  # no BallTree before the first batch: negatives are sampled randomly
+
+optimizer = optim.Adam(model.parameters(), lr=1e-6, weight_decay=0.05)
+criterion = torch.jit.script(TripletLoss())
+# TODO: check why there are more test embeddings than test labels
+x_emb = x_emb[:X_test.shape[0]]
+callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768)
+
+# training loop
+model.train()
+params = []
+for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"):
+    running_loss = []
+    for step in tqdm.tqdm(range(np.unique(y_train).shape[0]), desc="Training", leave=False):
+        anchor, positive, negative = data_loader.batch_generator(model, tree)
+
+        optimizer.zero_grad()
+
+        anchor_out = model(anchor)
+        positive_out = model(positive)
+        negative_out = model(negative)
+
+        loss = criterion(anchor_out, positive_out, negative_out)
+        loss.backward()
+        optimizer.step()
+
+        if step % 10 == 0:
+            # (disabled) rebuild the BallTree for hard-negative mining
+            # with torch.no_grad():
+            #     predictions = model(x_train_emb).cpu().numpy()
+            # tree = BallTree(predictions, metric="euclidean")
+
+            current_loss = loss.cpu().detach().numpy()
+            running_loss.append(current_loss)
+
+            # callback (accuracy)
+            metrics = callback.on_epoch_end(model, epoch, current_loss)
+            print(metrics)
+            params.append(metrics)
+            with open(TRAIN_PATH + 'training.pkl', 'wb') as f:
+                pickle.dump(params, f)
+
+            torch.save(model.state_dict(), TRAIN_PATH + 'model')
+
+    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
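+
+# Reload sketch (assumption: the state dict saved above, in a later session):
+#   model = Network(INPUT_SIZE, OUTPUT_SIZE)
+#   model.load_state_dict(torch.load(TRAIN_PATH + 'model'))
+#   model.eval()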
diff --git a/src/bert_attempts/DataGenerator.py b/src/bert_attempts/DataGenerator.py
new file mode 100644
index 0000000..cba9d54
--- /dev/null
+++ b/src/bert_attempts/DataGenerator.py
@@ -0,0 +1,75 @@
+import torch
+import tqdm
+
+import pandas as pd
+import numpy as np
+
+from sklearn.preprocessing import LabelEncoder
+from transformers import RobertaTokenizer, RobertaModel
+
+
+def generate_data(df_path: str, data_path: str, INPUT_SIZE: int, BATCH_SIZE: int):
+    df = pd.read_csv(df_path)
+    # df = df.drop(columns=["round", "task", "solution", "file",
+    #                       "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"])
+    # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n"))
+    # df = df[(df.n_lines > 0)]
+
+    # def _insert_tokens(x: str):
+    #     x = x.replace("\n", " NLN ")
+    #     x = x.replace("\t", " TAB ")
+    #     x = x.replace(" ", " SPC ")
+    #     return x
+    #
+    # df.flines = df.flines.apply(_insert_tokens)
+
+    # load the tokenizer
+    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
+    df.index = np.arange(len(df))
+    le = LabelEncoder()
+    df.user = le.fit_transform(df.user)
+    df['tokens'] = df.flines.apply(lambda x: tokenizer
+                                   .convert_tokens_to_ids(tokenizer.tokenize(x)))
+
+    dataset = df[["user", "tokens", "task"]]
+    # shuffle the dataset
+    dataset = dataset.sample(frac=1)
+
+    X = dataset.tokens.values
+
+    def fill_zeros(arr):
+        # pad with zeros up to INPUT_SIZE, or truncate down to INPUT_SIZE
+        arr = np.array(arr)
+        if INPUT_SIZE > arr.shape[0]:
+            arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant')
+        else:
+            arr = arr[:INPUT_SIZE]
+        return arr
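+    # e.g. with INPUT_SIZE = 5:
+    #   fill_zeros([7, 8, 9])        -> [7, 8, 9, 0, 0]
+    #   fill_zeros(list(range(10)))  -> [0, 1, 2, 3, 4]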
+
+    X = np.array([fill_zeros(x) for x in X])
+    y = np.array(dataset.user)
+    tasks = np.array(dataset.task)
+    train_indexes = np.where(tasks < 7)[0]
+    test_indexes = np.where(tasks >= 7)[0]
+    X_train, X_test = X[train_indexes], X[test_indexes]
+    y_train, y_test = y[train_indexes], y[test_indexes]
+
+    embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base")
+
+    def get_embedding(data):
+        # run CodeBERT over the token ids batch by batch and collect the hidden states
+        emb = []
+        with torch.no_grad():
+            for i in tqdm.tqdm(range(0, data.shape[0], BATCH_SIZE)):
+                batch = data[i: i + BATCH_SIZE]
+                new_part = embedding_model(torch.from_numpy(batch)).last_hidden_state
+                emb.append(new_part)
+        return emb
+
+    x_emb = get_embedding(X_test)
+    np.save(data_path + 'x_train.np', X_train)
+    np.save(data_path + 'y_test.np', y_test)
+    np.save(data_path + 'y_train.np', y_train)
+    np.save(data_path + 'x_test.np', X_test)
+    torch.save(torch.cat(x_emb), data_path + 'test_tensor.pt')
+    x_train_emb = get_embedding(X_train)
+    torch.save(torch.cat(x_train_emb), data_path + 'train_tensor.pt')
diff --git a/src/bert_attempts/GCJ.py b/src/bert_attempts/GCJ.py
new file mode 100644
index 0000000..b6c3de0
--- /dev/null
+++ b/src/bert_attempts/GCJ.py
@@ -0,0 +1,45 @@
+import torch
+import numpy as np
+
+'''
+Loader of the training data (triplet batch generator).
+'''
+
+
+class GCJ:
+
+    def __init__(self, X_train, y_train, batch_size, input_size):
+        self.x = X_train
+        self.y = y_train
+        self.batch_size = batch_size
+        self.input_size = input_size
+
+    def batch_generator(self, model, tree):
+        n_positive = self.batch_size // 2
+        anchor_index = np.random.choice(self.y.shape[0], 1)
+        y_anchor = self.y[anchor_index]
+        positive_indexes = np.where(self.y == y_anchor)[0]
+        n_same = positive_indexes.shape[0]
+        positive_indexes = positive_indexes[:n_positive]
+        k = self.batch_size - positive_indexes.shape[0]
+
+        if tree is not None:
+            # hard-negative mining: take the anchor's nearest neighbours in the
+            # embedding space that belong to other authors
+            with torch.no_grad():
+                query = model(self.x[anchor_index]).cpu().numpy()
+            query_res = tree.query(query, self.batch_size + n_same, return_distance=False)[0]
+            negative_indexes = np.array([neighbour_index for neighbour_index in query_res
+                                         if self.y[neighbour_index] != y_anchor])[:k]
+        else:  # the first batch generation: random negatives
+            negative_indexes = np.where(self.y != y_anchor)[0]
+            np.random.shuffle(negative_indexes)
+            negative_indexes = negative_indexes[:k]
+
+        local_x = self.x.reshape((-1, self.input_size, 768))
+
+        # sample (with replacement) batch_size positives and batch_size negatives
+        reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size),
+                              [positive_indexes, negative_indexes])
+
+        positive, negative = map(lambda i: local_x[i], reduced_indexes)
+        anchor = torch.cat([local_x[anchor_index] for _ in range(self.batch_size)])
+
+        return anchor, positive, negative
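+
+# Shape sketch (illustrative, with the defaults from BertBased.py):
+#   loader = GCJ(x_train_emb, y_train, batch_size=16, input_size=512)
+#   anchor, positive, negative = loader.batch_generator(model, tree=None)
+#   -> three tensors of shape (16, 512, 768)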
diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py
new file mode 100644
index 0000000..832e2ec
--- /dev/null
+++ b/src/bert_attempts/Network.py
@@ -0,0 +1,55 @@
+import torch
+from torch import nn
+
+
+class Network(nn.Module):
+    def __init__(self, input_size, output_size):
+        super(Network, self).__init__()
+
+        self.input_size = input_size
+        self.output_size = output_size
+        self.conv_sizes = [2, 4, 16]
+        # output length of each convolution (no padding): input_size - kernel + 1
+        self.pool_size = [self.input_size - size + 1 for size in self.conv_sizes]
+        self.channels = 2
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(1, self.channels, kernel_size=(2, 768)),
+            nn.ReLU(),
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(1, self.channels, kernel_size=(4, 768)),
+            nn.ReLU(),
+        )
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(1, self.channels, kernel_size=(16, 768)),
+            nn.ReLU(),
+        )
+
+        self.fc = nn.Sequential(
+            nn.LayerNorm(sum(self.pool_size) * self.channels),
+            nn.Dropout(0.5),
+            nn.Linear(sum(self.pool_size) * self.channels, self.input_size),
+            nn.ReLU(),
+            nn.LayerNorm(self.input_size),
+            nn.Dropout(0.5),
+            nn.Linear(self.input_size, self.output_size),
+            nn.ReLU()
+        )
+
+    def forward(self, x):
+        # (batch, tokens, 768) -> (batch, 1, tokens, 768) for Conv2d
+        x = torch.reshape(x, (-1, 1, self.input_size, 768))
+
+        # three parallel convolutions with different kernel heights
+        x1 = self.conv1(x)
+        x2 = self.conv2(x)
+        x3 = self.conv3(x)
+
+        # flatten each (batch, channels, input_size - k + 1, 1) feature map
+        x1 = x1.view(-1, self.channels * self.pool_size[0])
+        x2 = x2.view(-1, self.channels * self.pool_size[1])
+        x3 = x3.view(-1, self.channels * self.pool_size[2])
+
+        x = torch.cat([x1, x2, x3], -1)
+        x = self.fc(x)
+        return x
diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md
new file mode 100644
index 0000000..d86a573
--- /dev/null
+++ b/src/bert_attempts/README.md
@@ -0,0 +1,20 @@
+# BERT-like attempts
+
+## Current tasks
+- [x] ! add model saving
+- [x] ! add loss saving
+- [x] ! limit the number of test evaluations => speed up the training
+- [x] split the file into separate classes
+- [x] !! change the model to the one in `Embedding.py` (regularization, parallel evaluation)
+- [x] debug
+- [x] separate data from code
+
+## Execution:
+1. The entry point is `BertBased.py`.
+2. The data-generation line:
+
+```python
+generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)
+```
+
+Comment this line out if the data has already been generated (it runs on CPU and takes ~1.5 h).
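+
+A minimal end-to-end sketch for a first run (assuming the default paths and constants defined in `BertBased.py`):
+
+```python
+from DataGenerator import generate_data
+
+# first run only: tokenize the CSV and cache the CodeBERT embeddings under ./data/
+generate_data(df_path='../../inputs/processed_dfs/cpp_9_tasks_2016.csv',
+              data_path='./data/', INPUT_SIZE=512, BATCH_SIZE=64)
+```
\ No newline at end of file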
diff --git a/src/bert_attempts/TripletLoss.py b/src/bert_attempts/TripletLoss.py
new file mode 100644
index 0000000..875959a
--- /dev/null
+++ b/src/bert_attempts/TripletLoss.py
@@ -0,0 +1,24 @@
+import torch
+from torch import nn
+
+'''
+With reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch
+'''
+
+
+class TripletLoss(nn.Module):
+
+    def __init__(self, margin=0.1):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+
+    @staticmethod
+    def calc_euclidean(x1, x2):
+        # squared euclidean distance per sample
+        return (x1 - x2).pow(2).sum(1)
+
+    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
+        distance_positive = self.calc_euclidean(anchor, positive)
+        distance_negative = self.calc_euclidean(anchor, negative)
+        # relu(x) is equivalent to max(x, 0): it clamps negative triplet losses to zero
+        losses = torch.relu(distance_positive - distance_negative + self.margin)
+        return losses.mean()
diff --git a/src/main.py b/src/main.py
index 5dfd8d8..aea8cf4 100644
--- a/src/main.py
+++ b/src/main.py
@@ -2,7 +2,7 @@
 # from training.AvgTriplet import AverageTriplet
 from models.Embedding import Embedding
 # from models.Conv2D import Conv2D
-from visualization.VisualizerTokenFeatures import VisualizerTokenFeatures
+# from visualization.VisualizerTokenFeatures import VisualizerTokenFeatures
 # from visualization.VisualizerCharFeatures import VisualizerCharFeatures
 
 import tensorflow as tf
@@ -11,8 +11,8 @@ for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu, True)
 
 
-model = Embedding(input_size=800, crop=200, output_size=50, make_initial_preprocess=True)
-SingleTriplet(model=model).train(batch_size=16, epochs=40, epoch_start=0, step_start=0)
+model = Embedding(input_size=800, crop=800, output_size=50, make_initial_preprocess=True)
+SingleTriplet(model=model).train(batch_size=16, epochs=50, epoch_start=0, step_start=0)
 
 # VisualizerTokenFeatures().run()
 # model = Conv2D()
diff --git a/src/models/Embedding.py b/src/models/Embedding.py
index b1a4643..dcde9de 100644
--- a/src/models/Embedding.py
+++ b/src/models/Embedding.py
@@ -28,9 +28,10 @@ def create_after_emb(self, reshape1,
                          conv_channels=2,
                          emb_height=100,
                          activation="relu",
-                         L2_lambda=0.02,
-                         conv_sizes=[2, 4, 16]):
+                         L2_lambda=0.05,
+                         conv_sizes=[2, 4, 8, 16]):
         # parallel piece
+        # d1 = layers.Dropout(0.3)(reshape1)
         convolutions = [layers.Conv2D(conv_channels, (conv_size, emb_height),
                                       name="conv2d_size_{}".format(conv_size),
                                       padding="same", activation=activation,
@@ -60,8 +61,8 @@
 
     def create_model(self,
                      activation: str = "relu",
-                     L2_lambda: float = 0.02,
-                     conv_sizes: List[int] = [2, 4, 16],
+                     L2_lambda: float = 0.05,
+                     conv_sizes: List[int] = [2, 4, 8, 16],
                      emb_height: int = 100):
 
         conv_channels = 2
diff --git a/src/models/data_processing/TokenFeatures.py b/src/models/data_processing/TokenFeatures.py
index 5b85a31..0113c2c 100644
--- a/src/models/data_processing/TokenFeatures.py
+++ b/src/models/data_processing/TokenFeatures.py
@@ -29,7 +29,7 @@ def __init__(self,
 
     @staticmethod
     def _write_vocab_file(filepath: str, vocab: List[str]):
-        with open(filepath, "w") as f:
+        with open(filepath, "w", encoding="utf-8") as f:
             for token in vocab:
                 print(token, file=f)
 
@@ -38,7 +38,7 @@ def _insert_tokens(x: str):
         x = x.replace("\n", " NLN ")
         x = x.replace("\t", " TAB ")
         x = x.replace(" ", " SPC ")
-        return x
+        return x.encode("utf-8")
 
     def initial_preprocess(self, df_path: str, tmp_dataset_filename: str):
         df = self._initial_load(df_path)
@@ -61,11 +61,11 @@ def initial_preprocess(self, df_path: str, tmp_dataset_filename: str):
         # reduce the size of the dataset according to the n_tokens
         df.index = np.arange(len(df))
         df["n_tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).shape[0])
-        df = df[df.n_tokens <= self.input_size]
+        # df = df[df.n_tokens <= self.input_size]
         # reindex
         df.index = np.arange(len(df))
         # reduce size
-        df = self._user_selection_and_encoding(df, 50, 450)
+        df = self._user_selection_and_encoding(df, 0, 400)
         # long saving
         # The issue is that `tokenizer.tokenize()` do not always return a shape (-1, 1).
         # Some elements of the result of the function could be a list, e.g. [[2929, 8524]].
@@ -104,7 +104,6 @@ def secondary_preprocess(self, tmp_dataset_filename: str):
         test_indexes = np.where(tasks >= 7)[0]
         X_train, X_test = X[train_indexes], X[test_indexes]
         y_train, y_test = y[train_indexes], y[test_indexes]
-
         # X_train, y_train = self._crop_to(X_train, y_train, rs1=(-1, self.crop), rs2=(-1, self.crop, 1))
         # X_test, y_test = self._crop_to(X_test, y_test, rs1=(-1, self.crop), rs2=(-1, self.crop, 1))
         # self.input_size = self.crop
diff --git a/src/models/data_processing/base/DataLoading.py b/src/models/data_processing/base/DataLoading.py
index f9d574f..a5a59f0 100644
--- a/src/models/data_processing/base/DataLoading.py
+++ b/src/models/data_processing/base/DataLoading.py
@@ -87,7 +87,7 @@ def _crop_to(self,
         return new_X, new_y
 
     def preprocess(self,
-                   df_path: str = "../inputs/processed_dfs/cpp_9_tasks_2016.csv",
+                   df_path: str = "../inputs/processed_dfs/valid_py_9_tasks_2020.csv",
                    tmp_dataset_dir: str = "../inputs/preprocessed_jsons/") -> Tuple[np.ndarray, np.ndarray,
                                                                                     np.ndarray, np.ndarray]:
         """
diff --git a/src/visualization/base/Visualizer.py b/src/visualization/base/Visualizer.py
index 750b466..5833267 100644
--- a/src/visualization/base/Visualizer.py
+++ b/src/visualization/base/Visualizer.py
@@ -23,7 +23,7 @@ def __init__(self,
         self.model_name = model_name
         self.snippet_index = snippet_index
 
-        self.model = tf.keras.models.load_model('../outputs/{}_0.h'.format(model_name))
+        self.model = tf.keras.models.load_model('../outputs/{}_49.h'.format(model_name))
         all_x, _, all_y, _ = data_loader.secondary_preprocess("../inputs/preprocessed_jsons/{}_train.json"
                                                               .format(model_name))
         self.triplet_type = AverageTriplet(self.model)