Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ myenv*
embd/*
embd
*.csv
*.npy
*.pkl
*.pt
*.h5
*.png
*.json
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ tqdm==4.58.0
sentencepiece==0.1.95
pydot==1.4.2
tensorflow-text==2.5.0
torch==1.10.0
torchvision==0.11.1
transformers==4.15.0
129 changes: 129 additions & 0 deletions src/bert_attempts/AccuracyEvaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import datetime
import io
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

class AccuracyEvaluator:
    """Per-epoch evaluation callback for the triplet-loss embedding model.

    After each epoch the current embeddings are classified with a
    KNeighborsClassifier and the accuracy is reported; optionally a 2-D
    t-SNE projection of the embeddings is saved as a PNG for inspection.
    """

    def __init__(self,
                 X_train,
                 X_test,
                 y_train: np.ndarray,
                 y_test: np.ndarray,
                 threshold: float = 0.1,
                 input_size: int = 500,
                 authors: Optional[List] = None):
        """
        Parameters:
        - `X_train`, `X_test` - arrays with data (tokens)
        - `y_train`, `y_test` - np.arrays, labels (numerical representation of authors)

        - `threshold` - alpha parameter of the triplet loss, threshold for the
          classification's distance
        - `input_size` - amount of tokens in one file
        - `authors` - subset of author labels to evaluate/plot; the prediction
          stage requires an all-with-all comparison (O(n^2)), which is why it
          is reduced. Defaults to the first 20 authors.
          (BUGFIX: was a mutable default argument `list(range(20))`; replaced
          with the None-sentinel idiom — same effective default.)
        """
        super().__init__()
        self.threshold = threshold
        self.input_size = input_size
        # x-y preprocessing
        self.authors = list(range(20)) if authors is None else authors

        def select_authors(initial_x, initial_y):
            # Keep only the samples whose label is in the reduced author set.
            index = np.where(np.isin(initial_y, self.authors))[0]
            new_x = initial_x[index]
            new_y = initial_y[index]
            return new_x, new_y

        simple_x_train, simple_y_train = select_authors(X_train, y_train)
        simple_x_test, simple_y_test = select_authors(X_test, y_test)

        self.data = {
            "simple": {
                "train": [simple_x_train, simple_y_train],
                "test": [simple_x_test, simple_y_test]
            },
            "full": {
                "train": [X_train, y_train],
                "test": [X_test, y_test]
            }
        }

        # Step counter, incremented once per on_epoch_end call.
        self.n = 0

    @staticmethod
    def _plot_to_image(figure):
        """Render the current matplotlib figure into a PNG byte buffer.

        Based on https://www.tensorflow.org/tensorboard/image_summaries
        BUGFIX: the original never returned the buffer, so callers always
        received None.
        """
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        plt.close(figure)
        buf.seek(0)
        return buf

    def apply_dimensionality_reduction(self,
                                       transformed_x,
                                       y: np.ndarray,
                                       epoch: int,
                                       is_test: bool):
        """Project embeddings to 2-D with t-SNE and save one scatter per author."""
        vectors = TSNE(n_components=2)
        x_pca = vectors.fit_transform(transformed_x)
        figure = plt.figure(figsize=(10, 8))
        plt.title("Step {} (epoch {})".format(self.n, epoch))
        for developer in self.authors:
            indexes = np.where(y == developer)[0]
            plt.plot(x_pca[indexes, 0], x_pca[indexes, 1], "o", ms=5)
        # save as file (assumes an existing "outputs/" directory — TODO confirm)
        plt.savefig("outputs/tsne_{}.png".format(self.n))
        # log to tensorboard (disabled)
        # image = self._plot_to_image(figure)
        # writer = self.test_summary_writer if is_test else self.train_summary_writer
        # with writer.as_default():
        #     tf.summary.image("Distribution of authors", image, step=self.n)

        plt.close("all")

    def get_acc(self,
                model,
                x,
                y: np.ndarray,
                epoch: int,
                is_test: bool,
                dim_red: bool = True) -> float:
        """Embed `x` with `model` and return the KNN accuracy on the embeddings.

        BUGFIX: the original signature was `dim_red: True` (a literal used as
        a type annotation, with no default); it is now a proper `bool` flag.
        Callers pass it positionally, so the interface is unchanged.
        """
        with torch.no_grad():
            transformed_x = model(x)
            # Fit and evaluate a KNN classifier on the embedded samples.
            knn = KNeighborsClassifier().fit(transformed_x, y)
            predictions = knn.predict(transformed_x)
            accuracy = accuracy_score(y_true=y, y_pred=predictions)
        if dim_red:
            self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test)
        return accuracy

    def _writer(self,
                x,
                y,
                model,
                epoch: int,
                is_test: bool,
                is_simple: bool) -> float:
        """Thin wrapper around get_acc.

        NOTE(review): `is_simple` is forwarded as get_acc's `dim_red` flag,
        so t-SNE plots are only produced for the reduced ("simple") subset.
        """
        accuracy = self.get_acc(model, x, y, epoch, is_test, is_simple)
        return accuracy

    def on_epoch_end(self,
                     model,
                     epoch: int,
                     loss: float):
        """Evaluate all three splits; returns (train_simple, test_simple, test_full)."""
        aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True)
        astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True)
        afte = self._writer(*self.data["full"]["test"], model, epoch, True, False)

        print(loss, astr, aste, afte)
        self.n += 1
        return astr, aste, afte
113 changes: 113 additions & 0 deletions src/bert_attempts/BertBased.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import tqdm
import torch
import pickle

import numpy as np
import torch.nn as nn
import torch.optim as optim

from sklearn.neighbors import BallTree

from AccuracyEvaluator import AccuracyEvaluator
from GCJ import GCJ
from Network import Network
from TripletLoss import TripletLoss
from DataGenerator import generate_data


# Select GPU when available; model and data are moved to this device below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Informational only: resolves the CUDA device name (return value unused).
    torch.cuda.get_device_name()

# -------------------------- constants
df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
DATA_PATH = './data/'   # where generate_data stores/loads dataset artifacts
TRAIN_PATH = './train/' # where training metrics and model weights are saved

INPUT_SIZE = 512  # tokens per file (RoBERTa allows 514 positions incl. specials)
OUTPUT_SIZE = 256  # embedding dimensionality produced by Network
N_EPOCHS = 30
BATCH_SIZE = 16


def mkdir(dir_name):
    """Create `dir_name` (including missing parents); no-op if it exists.

    Uses the idiomatic `exist_ok=True` instead of the original
    try/except-FileExistsError with a diagnostic print — directory
    creation is idempotent either way.
    """
    os.makedirs(dir_name, exist_ok=True)


def init_weights(m):
    """Xavier-normal initialisation for Conv2d weights; other modules untouched.

    Intended for use with ``model.apply(init_weights)``.
    """
    if not isinstance(m, nn.Conv2d):
        return
    nn.init.xavier_normal_(m.weight)


# Ensure output directories exist before any artifacts are written.
mkdir(DATA_PATH)
mkdir(TRAIN_PATH)

# Build the embedding network, Xavier-init conv layers, then
# TorchScript-compile and move to the target device.
model = Network(INPUT_SIZE, OUTPUT_SIZE)
model.apply(init_weights)
model = torch.jit.script(model).to(device)

# One-off preprocessing (tokenize + CodeBERT-embed the dataset); its outputs
# are the .npy/.pt files loaded below.
# generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)

# Token-id arrays and precomputed CodeBERT embeddings saved by generate_data.
X_train = np.load(DATA_PATH + 'x_train.np.npy')
y_test = np.load(DATA_PATH + 'y_test.np.npy')
y_train = np.load(DATA_PATH + 'y_train.np.npy')
X_test = np.load(DATA_PATH + 'x_test.np.npy')
x_emb = torch.load(DATA_PATH + 'test_tensor.pt')
x_train_emb = torch.load(DATA_PATH + 'train_tensor.pt')
# Flat tensors -> (n_samples, 512 tokens, 768 hidden dims).
x_emb = torch.reshape(x_emb, (-1, 512, 768))
x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768))


# Triplet batch generator over the training embeddings.
data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)


# BallTree for hard-negative mining; None -> random negatives (first batches).
tree = None  # default value

optimizer = optim.Adam(model.parameters(), lr=10**(-6), weight_decay=0.05)
criterion = torch.jit.script(TripletLoss())
# todo: check, why
# NOTE(review): truncates test embeddings to the number of test labels —
# presumably the saved tensor has extra rows; verify in generate_data.
x_emb = x_emb[:X_test.shape[0]]
callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768)

# training loop
model.train()
params = []  # per-epoch accuracy metrics, re-pickled after every epoch
for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"):
    running_loss = []
    # One "step" per unique author: each batch is anchored on one author.
    for step in tqdm.tqdm(range(np.unique(y_train).shape[0]), desc="Training", leave=False):
        anchor, positive, negative = data_loader.batch_generator(model, tree)

        optimizer.zero_grad()

        anchor_out = model(anchor)
        positive_out = model(positive)
        negative_out = model(negative)

        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()

        # Sample the loss every 10 steps (hard-negative mining is disabled).
        if step % 10 == 0:
            #with torch.no_grad():
            #    predictions = model(x_train_emb)
            #tree = BallTree(predictions, metric="euclidean")

            current_loss = loss.cpu().detach().numpy()
            running_loss.append(current_loss)

    # callback (accuracy)
    metrics = callback.on_epoch_end(model, epoch, current_loss)
    print(metrics)
    params.append(metrics)
    # Persist the metric history after every epoch so progress survives crashes.
    with open(TRAIN_PATH + 'training.pkl', 'wb') as f:
        pickle.dump(params, f)

    torch.save(model.state_dict(), TRAIN_PATH + 'model')

    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
75 changes: 75 additions & 0 deletions src/bert_attempts/DataGenerator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import torch
import tqdm

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaModel


def generate_data(df_path: str, data_path: str, INPUT_SIZE: int, BATCH_SIZE: int):
    """Tokenize solution files with CodeBERT and persist train/test artifacts.

    Reads the CSV at `df_path` (uses columns `flines` — source text,
    `user` — author, `task` — task number), tokenizes each solution,
    pads/truncates to `INPUT_SIZE` token ids, splits train/test by task
    (tasks < 7 train, >= 7 test), embeds both splits with CodeBERT in
    mini-batches of `BATCH_SIZE`, and saves the resulting .npy and .pt
    files under `data_path`.
    """
    df = pd.read_csv(df_path)
    # df = df.drop(columns=["round", "task", "solution", "file",
    #                       "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"])
    # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n"))
    # df = df[(df.n_lines > 0)]

    # def _insert_tokens(x: str):
    #     x = x.replace("\n", " NLN ")
    #     x = x.replace("\t", " TAB ")
    #     x = x.replace(" ", " SPC ")
    #     return x
    #
    # df.flines = df.flines.apply(_insert_tokens)

    # load tokenizer
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
    df.index = np.arange(len(df))
    # Map author names to contiguous integer labels.
    le = LabelEncoder()
    df.user = le.fit_transform(df.user)
    df['tokens'] = df.flines.apply(lambda x: tokenizer
                                   .convert_tokens_to_ids(tokenizer.tokenize(x)))

    dataset = df[["user", "tokens", "task"]]
    # shuffle dataset
    dataset = dataset.sample(frac=1)

    X = dataset.tokens.values

    def fill_zeros(arr):
        # Right-pad with zeros (or truncate) to exactly INPUT_SIZE token ids.
        arr = np.array(arr)
        if INPUT_SIZE > arr.shape[0]:
            arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant')
        else:
            arr = arr[:INPUT_SIZE]
        return arr.reshape(INPUT_SIZE, 1).tolist()

    X = np.array([fill_zeros(x) for x in X])
    X = X.reshape((-1, INPUT_SIZE))
    y = np.array(dataset.user)
    tasks = np.array(dataset.task)
    # Task-based split: no task appears in both train and test.
    train_indexes = np.where(tasks < 7)[0]
    test_indexes = np.where(tasks >= 7)[0]
    X_train, X_test = X[train_indexes], X[test_indexes]
    y_train, y_test = y[train_indexes], y[test_indexes]

    embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base")

    def get_embedding(data):
        # Embed token-id rows in mini-batches; returns a list of per-sample
        # last-hidden-state tensors.
        emb = []
        with torch.no_grad():
            for i in tqdm.tqdm(range(0, data.shape[0], BATCH_SIZE)):
                batch = data[i: i + BATCH_SIZE]
                new_part = embedding_model(torch.from_numpy(batch)).last_hidden_state
                # BUGFIX: the original did `emb = [*emb, *new_part]`, which
                # rebuilds the entire list every batch (quadratic overall);
                # extend appends in place (linear), same resulting list.
                emb.extend(new_part)
        return emb

    x_emb = get_embedding(X_test)
    np.save(data_path + 'x_train.np', X_train)
    np.save(data_path + 'y_test.np', y_test)
    np.save(data_path + 'y_train.np', y_train)
    np.save(data_path + 'x_test.np', X_test)
    torch.save(torch.cat(x_emb), data_path + 'test_tensor.pt')
    x_train_emb = get_embedding(X_train)
    torch.save(torch.cat(x_train_emb), data_path + 'train_tensor.pt')
45 changes: 45 additions & 0 deletions src/bert_attempts/GCJ.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import torch
import numpy as np

'''
The loader of the train data (batch generator)
'''


class GCJ:
    """Training-data loader: yields (anchor, positive, negative) triplet batches."""

    def __init__(self, X_train, y_train, batch_size, input_size):
        # X_train: embedded training samples; y_train: integer author labels.
        self.x = X_train
        self.y = y_train
        self.batch_size = batch_size
        self.input_size = input_size

    def batch_generator(self, model, tree):
        """Build one triplet batch around a randomly chosen anchor sample.

        `tree` is a BallTree over current embeddings, or None on the first
        call. When present, negatives are hard-mined: the anchor's nearest
        neighbours that carry a different author label.
        """
        # Up to half the batch is filled with same-author (positive) samples.
        n_positive = self.batch_size // 2
        anchor_index = np.random.choice(self.y.shape[0], 1)
        y_anchor = self.y[anchor_index]
        positive_indexes = np.where(self.y == y_anchor)[0]
        n_same = positive_indexes.shape[0]
        positive_indexes = positive_indexes[:n_positive]
        # k = remaining slots to fill with negatives.
        k = self.batch_size - positive_indexes.shape[0]

        if tree is not None:
            with torch.no_grad():
                query = model(self.x[anchor_index])
            # Query batch_size + n_same neighbours so that enough survive
            # after filtering out same-author hits.
            query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0]
            negative_indexes = np.array([neighbour_index for neighbour_index in query_res
                                         if self.y[neighbour_index] != y_anchor])[:k]
        else:  # the first batch generation: random negatives
            negative_indexes = np.where(self.y != y_anchor)[0]
            np.random.shuffle(negative_indexes)
            negative_indexes = negative_indexes[:k]

        # View the data as (n_samples, input_size, 768) embedding blocks.
        local_x = self.x.reshape((-1, self.input_size, 768))

        # Resample each index set up to batch_size entries (with replacement).
        reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size),
                              [positive_indexes, negative_indexes])

        positive, negative = map(lambda i: local_x[i], reduced_indexes)
        # The single anchor sample is repeated batch_size times.
        anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)])

        return anchor, positive, negative
Loading