From 907a4c189cfea03ef460f0ffd70a35fa5ea273e1 Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Wed, 9 Feb 2022 13:21:03 +0300
Subject: [PATCH 1/9] CodeBERT-based model draft

---
 requirements.txt                       |   3 +
 src/bert_attempts/AccuracyEvaluator.py | 128 +++++++++++++
 src/bert_attempts/BertBased.py         | 243 +++++++++++++++++++++++++
 3 files changed, 374 insertions(+)
 create mode 100644 src/bert_attempts/AccuracyEvaluator.py
 create mode 100644 src/bert_attempts/BertBased.py

diff --git a/requirements.txt b/requirements.txt
index f5315eb..cb6c546 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,6 @@ tqdm==4.58.0
 sentencepiece==0.1.95
 pydot==1.4.2
 tensorflow-text==2.5.0
+torch==1.10.0
+torchvision==0.11.1
+transformers==4.15.0
diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py
new file mode 100644
index 0000000..a114301
--- /dev/null
+++ b/src/bert_attempts/AccuracyEvaluator.py
@@ -0,0 +1,128 @@
+import io
+import datetime
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+
+from typing import List
+from sklearn.manifold import TSNE
+from sklearn.metrics import accuracy_score
+from sklearn.neighbors import KNeighborsClassifier
+
+class AccuracyEvaluator:
+
+    def __init__(self,
+                 # X_train: np.ndarray,
+                 X_test: np.ndarray,
+                 # y_train: np.ndarray,
+                 y_test: np.ndarray,
+                 threshold: float = 0.1,
+                 input_size: int = 500,
+                 authors: List = list(range(20))):
+        """
+        Parameters:
+        - `X_train`, `X_test` - np.arrays with the data (tokens)
+        - `y_train`, `y_test` - np.arrays with the labels (numerical representation of the authors)
+
+        - `threshold` - the alpha (margin) parameter of the triplet loss; the distance threshold for classification
+        - `input_size` - number of tokens in one file
+        - `authors` - list of author ids; the prediction stage requires an all-with-all
+          comparison (O(n^2)), so it is restricted to this subset for plotting and evaluation
+        """
+        super().__init__()
+        self.threshold = threshold
+        self.input_size = input_size
+        # x-y preprocessing
+        self.authors = authors
+
+        def select_authors(initial_x, initial_y):
+            index = np.where(np.isin(initial_y, self.authors))[0]
+            new_x, new_y = map(lambda a: a[index], [initial_x, initial_y])
+            return new_x, new_y
+
+        # simple_x_train, simple_y_train = select_authors(X_train, y_train)
+        simple_x_test, simple_y_test = select_authors(X_test, y_test)
+
+        self.data = {
+            "simple": {
+                # "train": [simple_x_train, simple_y_train],
+                "test": [simple_x_test, simple_y_test]
+            },
+            "full": {
+                # "train": [X_train, y_train],
+                "test": [X_test, y_test]
+            }
+        }
+
+        # counter initialization
+        self.n = 0
+
+    @staticmethod
+    def _plot_to_image(figure):
+        # https://www.tensorflow.org/tensorboard/image_summaries
+        # NOTE: the buffer is prepared but decoding it into an image is not implemented yet
+        buf = io.BytesIO()
+        plt.savefig(buf, format="png")
+        plt.close(figure)
+        buf.seek(0)
+
+    def apply_dimensionality_reduction(self,
+                                       transformed_x: np.ndarray,
+                                       y: np.ndarray,
+                                       epoch: int,
+                                       is_test: bool):
+        vectors = TSNE(n_components=2)
+        x_pca = vectors.fit_transform(transformed_x)
+        figure = plt.figure(figsize=(10, 8))
+        plt.title("Step {} (epoch {})".format(self.n, epoch))
+        for developer in self.authors:
+            indexes = np.where(y == developer)[0]
+            plt.plot(x_pca[indexes, 0], x_pca[indexes, 1], "o", ms=5)
+        # save as file
+        plt.savefig("../outputs/tsne_{}/tsne_{}.png".format('bert', self.n))
+        # log to tensorboard
+        # image = self._plot_to_image(figure)
+        # writer = self.test_summary_writer if is_test else self.train_summary_writer
+        # with writer.as_default():
+        #     tf.summary.image("Distribution of authors", image, step=self.n)
+        plt.close("all")
+
+    def get_acc(self,
+                model,
+                x: np.ndarray,
+                y: np.ndarray,
+                epoch: int,
+                is_test: bool,
+                dim_red: True) -> float:
+
+        transformed_x = model(x)
+        knn = KNeighborsClassifier().fit(transformed_x, y)
+        predictions = knn.predict(transformed_x)
+        accuracy = accuracy_score(y_true=y, y_pred=predictions)
+        # if dim_red:
+        #     self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test)
+        return accuracy
+
+    def _writer(self,
+                x,
+                y,
+                model,
+                epoch: int,
+                is_test: bool,
+                is_simple: bool) -> float:
+
+        accuracy = self.get_acc(model, x, y, epoch, is_test, is_simple)
+        return accuracy
+
+    def on_epoch_end(self,
+                     model,
+                     epoch: int,
+                     loss: float):
+
+        # astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True)
+        aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True)
+        afte = self._writer(*self.data["full"]["test"], model, epoch, True, False)
+
+        print(loss, aste, afte)
+        self.n += 1
+        return aste, afte
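A note on the evaluation above: `get_acc` fits a `KNeighborsClassifier` on the transformed embeddings and then scores it on the same points, so (with the default k=5) the reported accuracy measures how tightly same-author samples cluster rather than performance on held-out data. A minimal sketch of that measurement with synthetic stand-ins — only the shapes follow this patch, the data is illustrative:

```python
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

embeddings = np.random.rand(100, 256)    # stand-in for model(x)
labels = np.random.randint(0, 20, 100)   # stand-in for author ids

# fit and predict on the same points: each query point's own label
# competes with its nearest neighbours, so the score is optimistic
knn = KNeighborsClassifier().fit(embeddings, labels)
print(accuracy_score(labels, knn.predict(embeddings)))
```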
diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
new file mode 100644
index 0000000..2dcb1e5
--- /dev/null
+++ b/src/bert_attempts/BertBased.py
@@ -0,0 +1,243 @@
+from transformers import RobertaTokenizer, RobertaModel
+import tqdm
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from AccuracyEvaluator import AccuracyEvaluator
+from sklearn.neighbors import BallTree
+from sklearn.preprocessing import LabelEncoder
+
+# with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+if device.type == "cuda":
+    torch.cuda.get_device_name()
+
+# -------------------------- constants
+df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
+tmp_dataset_dir = "../../inputs/preprocessed_jsons/"
+tmp_dataset_filename = tmp_dataset_dir + 'bert' + "_train.json"
+
+INPUT_SIZE = 512  # 514 tokens, maximum for bert
+OUTPUT_SIZE = 256
+N_EPOCHS = 100
+BATCH_SIZE = 16
+
+# -------------------------- load data
+df = pd.read_csv(df_path)
+# df = df.drop(columns=["round", "task", "solution", "file",
+#                       "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"])
+# df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n"))
+# df = df[(df.n_lines > 0)]
+
+
+def _insert_tokens(x: str):
+    x = x.replace("\n", " NLN ")
+    x = x.replace("\t", " TAB ")
+    x = x.replace(" ", " SPC ")
+    return x
+
+df.flines = df.flines.apply(_insert_tokens)
+
+# load tokenizer
+tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
+df.index = np.arange(len(df))
+le = LabelEncoder()
+df.user = le.fit_transform(df.user)
+df['tokens'] = df.flines.apply(lambda x:
+                               tokenizer.convert_tokens_to_ids(
+                                   tokenizer.tokenize(x)))
+
+dataset = df[["user", "tokens", "task"]]
+# shuffle dataset
+dataset = dataset.sample(frac=1)
+
+X = dataset.tokens.values
+
+def fillZeros(arr):
+    arr = np.array(arr)
+    if INPUT_SIZE > arr.shape[0]:
+        arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant')
+    else:
+        arr = arr[:INPUT_SIZE]
+    return arr.reshape(INPUT_SIZE, 1).tolist()
+
+X = np.array([fillZeros(x) for x in X])
+X = X.reshape((-1, INPUT_SIZE))
+y = np.array(dataset.user)
+tasks = np.array(dataset.task)
+train_indexes = np.where(tasks < 7)[0]
+test_indexes = np.where(tasks >= 7)[0]
+X_train, X_test = X[train_indexes], X[test_indexes]
+y_train, y_test = y[train_indexes], y[test_indexes]  # 244 unique person
+
+# -------------------------- model architecture
+
+# let's do just a simple 
thing + +# 1. embedding from bert -> INPUT_SIZE * 768 +# 2. convolution (5*768) +# 3. fully-connected 500 +# 4. fully connected 100 + +# 1. pretrained part + +# train loader +class GCJ: + def __init__(self, X_train, y_train, batch_size = BATCH_SIZE): + self.x = X_train + self.y = y_train + self.batch_size = batch_size + + def batch_generator(self, model, tree): + n_positive = self.batch_size // 2 + anchor_index = np.random.choice(self.y.shape[0], 1) + y_anchor = y[anchor_index] + positive_indexes = np.where(self.y == y_anchor)[0] + n_same = positive_indexes.shape[0] + positive_indexes = positive_indexes[:n_positive] + k = self.batch_size - positive_indexes.shape[0] + + if tree is not None: + query = model(self.x[anchor_index]) + query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] + negative_indexes = np.array([neighbour_index for neighbour_index in query_res + if self.y[neighbour_index] != y_anchor])[:k] + else: # the first batch generation + negative_indexes = np.where(self.y != y_anchor)[0] + np.random.shuffle(negative_indexes) + negative_indexes = negative_indexes[:k] + + local_x = self.x.reshape((-1, INPUT_SIZE)) + + reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), + [positive_indexes, negative_indexes]) + + positive, negative = map(lambda i: local_x[i], reduced_indexes) + anchor = np.array([local_x[anchor_index] for _ in range(self.batch_size)]).reshape((-1, INPUT_SIZE)) + + return anchor, positive, negative + + def generator(self, model, tree): + while True: + yield self.batch_generator(model, tree) + + +# model + +class Network(nn.Module): + def __init__(self): + super(Network, self).__init__() + # conv_sizes = [2, 4, 16] + k_size = 8 + self.pool_size = INPUT_SIZE - k_size + 1 # output for conv + self.channels = 4 + self.conv = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), + nn.ReLU(), + ) + # for size in conv_sizes + # ] + self.fc = nn.Sequential( + nn.Linear(self.pool_size*self.channels, INPUT_SIZE), + nn.ReLU(), + nn.Linear(INPUT_SIZE, OUTPUT_SIZE), + nn.ReLU() + ) + + def forward(self, x): + # array = [conv(x) for conv in self.conv] + x = torch.reshape(x, (-1, 1, 512, 768)) + x = self.conv(x) + x = x.view(-1, self.channels*self.pool_size) + # x = torch.concat(array, dim=1) + x = self.fc(x) + return x + +# + +# configs +class TripletLoss(nn.Module): + def __init__(self, margin=1.0): + super(TripletLoss, self).__init__() + self.margin = margin + + def calc_euclidean(self, x1, x2): + return (x1 - x2).pow(2).sum(1) + + def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor: + distance_positive = self.calc_euclidean(anchor, positive) + distance_negative = self.calc_euclidean(anchor, negative) + losses = torch.relu(distance_positive - distance_negative + self.margin) + + return losses.mean() + +def init_weights(m): + if isinstance(m, nn.Conv2d): + torch.nn.init.xavier_normal_(m.weight) + + +data_loader = GCJ(X_train, y_train) + +embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") + +model = Network() +model.apply(init_weights) +model = torch.jit.script(model).to(device) + +tree = None # default value + +optimizer = optim.Adam(model.parameters(), lr=0.001) +criterion = torch.jit.script(TripletLoss()) + + +# test_emb = embedding_model(torch.from_numpy(X_test)).last_hidden_state +x_emb = [] +for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): + xs = X_train[i: i+BATCH_SIZE] + new_xs = 
embedding_model(torch.from_numpy(xs)).last_hidden_state + x_emb = [*x_emb, *new_xs] + +x_emb = np.array(x_emb) +# Connected to pydev debugger (build 213.5744.248) +# 16%|█▌ | 5/31 [03:37<18:48, 43.42s/it]/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown +# warnings.warn('resource_tracker: There appear to be %d ' +# +# Process finished with exit code 137 (interrupted by signal 9: SIGKILL) +callback = AccuracyEvaluator(x_emb, y_test, input_size=768) + +# training loop +model.train() +for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): + running_loss = [] + for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): + anchor, positive, negative = data_loader.batch_generator(model, tree) + anchor = embedding_model(anchor).last_hidden_state + positive = embedding_model(positive).last_hidden_state + negative = embedding_model(negative).last_hidden_state + + optimizer.zero_grad() + + anchor_out = model(anchor) + positive_out = model(positive) + negative_out = model(negative) + + loss = criterion(anchor_out, positive_out, negative_out) + loss.backward() + optimizer.step() + + # predictions = model(x_emb) + # tree = BallTree(predictions, metric="euclidean") + + current_loss = loss.cpu().detach().numpy() + print(current_loss) + running_loss.append(current_loss) + + # callback (accuracy) + metrics = callback.on_epoch_end(model, epoch, current_loss) + print(metrics) + + print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss))) \ No newline at end of file From 93839ca04b8773a81934ddf0697b0cea45e46c1c Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Wed, 9 Feb 2022 18:10:14 +0300 Subject: [PATCH 2/9] Simple model is ready for training --- src/bert_attempts/AccuracyEvaluator.py | 11 +- src/bert_attempts/BertBased.py | 150 ++++++++++++++----------- 2 files changed, 90 insertions(+), 71 deletions(-) diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py index a114301..2171d53 100644 --- a/src/bert_attempts/AccuracyEvaluator.py +++ b/src/bert_attempts/AccuracyEvaluator.py @@ -13,7 +13,7 @@ class AccuracyEvaluator: def __init__(self, # X_train: np.ndarray, - X_test: np.ndarray, + X_test, # y_train: np.ndarray, y_test: np.ndarray, threshold: float = 0.1, @@ -37,7 +37,8 @@ def __init__(self, def select_authors(initial_x, initial_y): index = np.where(np.isin(initial_y, self.authors))[0] - new_x, new_y = map(lambda a: a[index], [initial_x, initial_y]) + new_x = [initial_x[i] for i in index] + new_y = initial_y[index] return new_x, new_y # simple_x_train, simple_y_train = select_authors(X_train, y_train) @@ -89,13 +90,13 @@ def apply_dimensionality_reduction(self, def get_acc(self, model, - x: np.ndarray, + x, y: np.ndarray, epoch: int, is_test: bool, dim_red: True) -> float: - - transformed_x = model(x) + with torch.no_grad(): + transformed_x = model(torch.cat(x)) knn = KNeighborsClassifier().fit(transformed_x, y) predictions = knn.predict(transformed_x) accuracy = accuracy_score(y_true=y, y_pred=predictions) diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index 2dcb1e5..533d816 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -8,7 +8,6 @@ from AccuracyEvaluator import AccuracyEvaluator from sklearn.neighbors import BallTree from sklearn.preprocessing import 
LabelEncoder - # with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -21,58 +20,90 @@ tmp_dataset_dir = "../../inputs/preprocessed_jsons/" tmp_dataset_filename = tmp_dataset_dir + 'bert' + "_train.json" -INPUT_SIZE = 512 # 514 tokens, maximum for bert +INPUT_SIZE = 512 # 514 tokens, maximum for bert OUTPUT_SIZE = 256 N_EPOCHS = 100 BATCH_SIZE = 16 # -------------------------- load data -df = pd.read_csv(df_path) -# df = df.drop(columns=["round", "task", "solution", "file", -# "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) -# df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) -# df = df[(df.n_lines > 0)] - - -def _insert_tokens(x: str): - x = x.replace("\n", " NLN ") - x = x.replace("\t", " TAB ") - x = x.replace(" ", " SPC ") - return x - -df.flines = df.flines.apply(_insert_tokens) - -# load tokenizer -tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") -df.index = np.arange(len(df)) -le = LabelEncoder() -df.user = le.fit_transform(df.user) -df['tokens'] = df.flines.apply(lambda x: - tokenizer.convert_tokens_to_ids( - tokenizer.tokenize(x))) - -dataset = df[["user", "tokens", "task"]] -# shuffle dataset -dataset = dataset.sample(frac=1) - -X = dataset.tokens.values - -def fillZeros(arr): - arr = np.array(arr) - if INPUT_SIZE > arr.shape[0]: - arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') - else: - arr = arr[:INPUT_SIZE] - return arr.reshape(INPUT_SIZE, 1).tolist() - -X = np.array([fillZeros(x) for x in X]) -X = X.reshape((-1, INPUT_SIZE)) -y = np.array(dataset.user) -tasks = np.array(dataset.task) -train_indexes = np.where(tasks < 7)[0] -test_indexes = np.where(tasks >= 7)[0] -X_train, X_test = X[train_indexes], X[test_indexes] -y_train, y_test = y[train_indexes], y[test_indexes] # 244 unique person + + +def generate_data(): + df = pd.read_csv(df_path) + # df = df.drop(columns=["round", "task", "solution", "file", + # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) + # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) + # df = df[(df.n_lines > 0)] + + + # def _insert_tokens(x: str): + # x = x.replace("\n", " NLN ") + # x = x.replace("\t", " TAB ") + # x = x.replace(" ", " SPC ") + # return x + # + # df.flines = df.flines.apply(_insert_tokens) + + # load tokenizer + tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") + df.index = np.arange(len(df)) + le = LabelEncoder() + df.user = le.fit_transform(df.user) + df['tokens'] = df.flines.apply(lambda x: + tokenizer.convert_tokens_to_ids( + tokenizer.tokenize(x))) + + dataset = df[["user", "tokens", "task"]] + # shuffle dataset + dataset = dataset.sample(frac=1) + + X = dataset.tokens.values + + def fillZeros(arr): + arr = np.array(arr) + if INPUT_SIZE > arr.shape[0]: + arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') + else: + arr = arr[:INPUT_SIZE] + return arr.reshape(INPUT_SIZE, 1).tolist() + + X = np.array([fillZeros(x) for x in X]) + X = X.reshape((-1, INPUT_SIZE)) + y = np.array(dataset.user) + tasks = np.array(dataset.task) + train_indexes = np.where(tasks < 7)[0] + test_indexes = np.where(tasks >= 7)[0] + X_train, X_test = X[train_indexes], X[test_indexes] + y_train, y_test = y[train_indexes], y[test_indexes] # 244 unique person + + embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") + + x_emb = [] + with torch.no_grad(): + for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): + xs = X_train[i: 
i+BATCH_SIZE] + new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state + x_emb = [*x_emb, *new_xs] + + # save x_emb, x_train, y_test, y_train + + np.save('x_train.np', X_train) + np.save('y_test.np', y_test) + np.save('y_train.np', y_train) + np.save('x_test.np', X_test) + + for idx, tensor in enumerate(x_emb): + torch.save(tensor, f"test_tensors/tensor{idx}.pt") + + +# generate_data() +print('restoring') + +X_train = np.load('x_train.np.npy') +y_test = np.load('y_test.np.npy') +y_train = np.load('y_train.np.npy') +X_test = np.load('x_test.np.npy') +x_emb = [torch.load(f"test_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] # -------------------------- model architecture @@ -95,7 +126,7 @@ def __init__(self, X_train, y_train, batch_size = BATCH_SIZE): def batch_generator(self, model, tree): n_positive = self.batch_size // 2 anchor_index = np.random.choice(self.y.shape[0], 1) - y_anchor = y[anchor_index] + y_anchor = self.y[anchor_index] positive_indexes = np.where(self.y == y_anchor)[0] n_same = positive_indexes.shape[0] positive_indexes = positive_indexes[:n_positive] @@ -193,20 +224,6 @@ def init_weights(m): optimizer = optim.Adam(model.parameters(), lr=0.001) criterion = torch.jit.script(TripletLoss()) - -# test_emb = embedding_model(torch.from_numpy(X_test)).last_hidden_state -x_emb = [] -for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): - xs = X_train[i: i+BATCH_SIZE] - new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state - x_emb = [*x_emb, *new_xs] - -x_emb = np.array(x_emb) -# Connected to pydev debugger (build 213.5744.248) -# 16%|█▌ | 5/31 [03:37<18:48, 43.42s/it]/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown -# warnings.warn('resource_tracker: There appear to be %d ' -# -# Process finished with exit code 137 (interrupted by signal 9: SIGKILL) callback = AccuracyEvaluator(x_emb, y_test, input_size=768) # training loop @@ -215,9 +232,10 @@ def init_weights(m): running_loss = [] for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): anchor, positive, negative = data_loader.batch_generator(model, tree) - anchor = embedding_model(anchor).last_hidden_state - positive = embedding_model(positive).last_hidden_state - negative = embedding_model(negative).last_hidden_state + with torch.no_grad(): + anchor = embedding_model(torch.from_numpy(anchor)).last_hidden_state + positive = embedding_model(torch.from_numpy(positive)).last_hidden_state + negative = embedding_model(torch.from_numpy(negative)).last_hidden_state optimizer.zero_grad() From 77791ca1484a9e737d2bc3cdf1f7bc3b86c53bec Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Wed, 9 Feb 2022 21:58:12 +0300 Subject: [PATCH 3/9] Added embeddings for training set --- src/bert_attempts/AccuracyEvaluator.py | 27 ++++++++++++++------------ src/bert_attempts/BertBased.py | 21 ++++++++++++++++---- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py index 2171d53..376b237 100644 --- a/src/bert_attempts/AccuracyEvaluator.py +++ b/src/bert_attempts/AccuracyEvaluator.py @@ -12,9 +12,9 @@ class AccuracyEvaluator: def __init__(self, - # X_train: np.ndarray, + X_train, X_test, - # y_train: np.ndarray, + y_train: np.ndarray, y_test: np.ndarray, threshold: float = 0.1, input_size: int = 500, @@ -37,20 
+37,23 @@ def __init__(self, def select_authors(initial_x, initial_y): index = np.where(np.isin(initial_y, self.authors))[0] - new_x = [initial_x[i] for i in index] + new_x = initial_x[index] new_y = initial_y[index] return new_x, new_y - # simple_x_train, simple_y_train = select_authors(X_train, y_train) + X_test = torch.cat(X_test) + X_train = torch.cat(X_train) + + simple_x_train, simple_y_train = select_authors(X_train, y_train) simple_x_test, simple_y_test = select_authors(X_test, y_test) self.data = { "simple": { - # "train": [simple_x_train, simple_y_train], + "train": [simple_x_train, simple_y_train], "test": [simple_x_test, simple_y_test] }, "full": { - # "train": [X_train, y_train], + "train": [X_train, y_train], "test": [X_test, y_test] } } @@ -67,7 +70,7 @@ def _plot_to_image(figure): buf.seek(0) def apply_dimensionality_reduction(self, - transformed_x: np.ndarray, + transformed_x, y: np.ndarray, epoch: int, is_test: bool): @@ -100,8 +103,8 @@ def get_acc(self, knn = KNeighborsClassifier().fit(transformed_x, y) predictions = knn.predict(transformed_x) accuracy = accuracy_score(y_true=y, y_pred=predictions) - # if dim_red: - # self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test) + if dim_red: + self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test) return accuracy def _writer(self, @@ -120,10 +123,10 @@ def on_epoch_end(self, epoch: int, loss: float): - # astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) + astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True) afte = self._writer(*self.data["full"]["test"], model, epoch, True, False) - print(loss, aste, afte) + print(loss,astr, aste, afte) self.n += 1 - return aste, afte + return astr, aste, afte diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index 533d816..1efad7a 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -85,6 +85,12 @@ def fillZeros(arr): new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state x_emb = [*x_emb, *new_xs] + x_train_emb = [] + with torch.no_grad(): + for i in tqdm.tqdm(range(0, X_train.shape[0], BATCH_SIZE)): + xs = X_train[i: i+BATCH_SIZE] + new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state + x_train_emb = [*x_train_emb, *new_xs] # save x_emb, x_train, y_test, y_train np.save('x_train.np', X_train) @@ -95,8 +101,12 @@ def fillZeros(arr): for idx, tensor in enumerate(x_emb): torch.save(tensor, f"test_tensors/tensor{idx}.pt") + for idx, tensor in enumerate(x_train_emb): + torch.save(tensor, f"train_tensors/tensor{idx}.pt") -# generate_data() + + +generate_data() print('restoring') X_train = np.load('x_train.np.npy') @@ -104,6 +114,7 @@ def fillZeros(arr): y_train = np.load('y_train.np.npy') X_test = np.load('x_test.np.npy') x_emb = [torch.load(f"test_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] +x_train_emb = [torch.load(f"train_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] # -------------------------- model architecture @@ -224,8 +235,9 @@ def init_weights(m): optimizer = optim.Adam(model.parameters(), lr=0.001) criterion = torch.jit.script(TripletLoss()) -callback = AccuracyEvaluator(x_emb, y_test, input_size=768) +callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768) +x_train_emb = torch.cat(x_train_emb) # training loop model.train() for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): @@ -247,8 +259,9 
@@ def init_weights(m): loss.backward() optimizer.step() - # predictions = model(x_emb) - # tree = BallTree(predictions, metric="euclidean") + with torch.no_grad(): + predictions = model(x_train_emb) + tree = BallTree(predictions, metric="euclidean") current_loss = loss.cpu().detach().numpy() print(current_loss) From fe83565bc735ca427e4883e90b4e9ae8035b3ed9 Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Fri, 11 Feb 2022 09:34:55 +0300 Subject: [PATCH 4/9] actual first training --- src/bert_attempts/AccuracyEvaluator.py | 9 ++--- src/bert_attempts/BertBased.py | 54 +++++++++++++------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/bert_attempts/AccuracyEvaluator.py b/src/bert_attempts/AccuracyEvaluator.py index 376b237..e1ab84c 100644 --- a/src/bert_attempts/AccuracyEvaluator.py +++ b/src/bert_attempts/AccuracyEvaluator.py @@ -41,9 +41,6 @@ def select_authors(initial_x, initial_y): new_y = initial_y[index] return new_x, new_y - X_test = torch.cat(X_test) - X_train = torch.cat(X_train) - simple_x_train, simple_y_train = select_authors(X_train, y_train) simple_x_test, simple_y_test = select_authors(X_test, y_test) @@ -82,7 +79,7 @@ def apply_dimensionality_reduction(self, indexes = np.where(y == developer)[0] plt.plot(x_pca[indexes, 0], x_pca[indexes, 1], "o", ms=5) # save as file - plt.savefig("../outputs/tsne_{}/tsne_{}.png".format('bert', self.n)) + plt.savefig("outputs/tsne_{}.png".format( self.n)) # log to tensorboard # image = self._plot_to_image(figure) # writer = self.test_summary_writer if is_test else self.train_summary_writer @@ -99,7 +96,7 @@ def get_acc(self, is_test: bool, dim_red: True) -> float: with torch.no_grad(): - transformed_x = model(torch.cat(x)) + transformed_x = model(x) knn = KNeighborsClassifier().fit(transformed_x, y) predictions = knn.predict(transformed_x) accuracy = accuracy_score(y_true=y, y_pred=predictions) @@ -123,8 +120,8 @@ def on_epoch_end(self, epoch: int, loss: float): - astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True) + astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True) afte = self._writer(*self.data["full"]["test"], model, epoch, True, False) print(loss,astr, aste, afte) diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index 1efad7a..ff08f14 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -22,7 +22,7 @@ INPUT_SIZE = 512 # 514 tokens, maximum for bert OUTPUT_SIZE = 256 -N_EPOCHS = 100 +N_EPOCHS = 30 BATCH_SIZE = 16 # -------------------------- load data @@ -85,6 +85,12 @@ def fillZeros(arr): new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state x_emb = [*x_emb, *new_xs] + np.save('x_train.np', X_train) + np.save('y_test.np', y_test) + np.save('y_train.np', y_train) + np.save('x_test.np', X_test) + torch.save(torch.cat(x_emb), 'test_tensor.pt') + print("main part saved") x_train_emb = [] with torch.no_grad(): for i in tqdm.tqdm(range(0, X_train.shape[0], BATCH_SIZE)): @@ -93,28 +99,19 @@ def fillZeros(arr): x_train_emb = [*x_train_emb, *new_xs] # save x_emb, x_train, y_test, y_train - np.save('x_train.np', X_train) - np.save('y_test.np', y_test) - np.save('y_train.np', y_train) - np.save('x_test.np', X_test) - - for idx, tensor in enumerate(x_emb): - torch.save(tensor, f"test_tensors/tensor{idx}.pt") + torch.save(torch.cat(x_train_emb), 'train_tensor.pt') - for idx, tensor in enumerate(x_train_emb): - 
torch.save(tensor, f"train_tensors/tensor{idx}.pt") - - - -generate_data() +# generate_data() print('restoring') X_train = np.load('x_train.np.npy') y_test = np.load('y_test.np.npy') y_train = np.load('y_train.np.npy') X_test = np.load('x_test.np.npy') -x_emb = [torch.load(f"test_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] -x_train_emb = [torch.load(f"train_tensors/tensor{idx}.pt") for idx in range(X_test.shape[0])] +x_emb = torch.load('test_tensor.pt') +x_train_emb = torch.load('train_tensor.pt') +x_emb = torch.reshape(x_emb, (-1, 512, 768)) +x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768)) # -------------------------- model architecture @@ -144,7 +141,8 @@ def batch_generator(self, model, tree): k = self.batch_size - positive_indexes.shape[0] if tree is not None: - query = model(self.x[anchor_index]) + with torch.no_grad(): + query = model(self.x[anchor_index]) query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] negative_indexes = np.array([neighbour_index for neighbour_index in query_res if self.y[neighbour_index] != y_anchor])[:k] @@ -153,13 +151,13 @@ def batch_generator(self, model, tree): np.random.shuffle(negative_indexes) negative_indexes = negative_indexes[:k] - local_x = self.x.reshape((-1, INPUT_SIZE)) + local_x = self.x.reshape((-1, INPUT_SIZE, 768)) reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), [positive_indexes, negative_indexes]) positive, negative = map(lambda i: local_x[i], reduced_indexes) - anchor = np.array([local_x[anchor_index] for _ in range(self.batch_size)]).reshape((-1, INPUT_SIZE)) + anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)]) return anchor, positive, negative @@ -180,12 +178,14 @@ def __init__(self): self.conv = nn.Sequential( nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), nn.ReLU(), + nn.Dropout(0.3) ) # for size in conv_sizes # ] self.fc = nn.Sequential( nn.Linear(self.pool_size*self.channels, INPUT_SIZE), nn.ReLU(), + nn.Dropout(0.3), nn.Linear(INPUT_SIZE, OUTPUT_SIZE), nn.ReLU() ) @@ -202,6 +202,8 @@ def forward(self, x): # # configs + + class TripletLoss(nn.Module): def __init__(self, margin=1.0): super(TripletLoss, self).__init__() @@ -217,12 +219,13 @@ def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch. 
return losses.mean() + def init_weights(m): if isinstance(m, nn.Conv2d): torch.nn.init.xavier_normal_(m.weight) -data_loader = GCJ(X_train, y_train) +data_loader = GCJ(x_train_emb, y_train) embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") @@ -234,20 +237,19 @@ def init_weights(m): optimizer = optim.Adam(model.parameters(), lr=0.001) criterion = torch.jit.script(TripletLoss()) - +x_emb = x_emb[:X_test.shape[0]] callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768) -x_train_emb = torch.cat(x_train_emb) # training loop model.train() for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): running_loss = [] for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): anchor, positive, negative = data_loader.batch_generator(model, tree) - with torch.no_grad(): - anchor = embedding_model(torch.from_numpy(anchor)).last_hidden_state - positive = embedding_model(torch.from_numpy(positive)).last_hidden_state - negative = embedding_model(torch.from_numpy(negative)).last_hidden_state + # with torch.no_grad(): + # anchor = embedding_model(anchor).last_hidden_state + # positive = embedding_model(positive).last_hidden_state + # negative = embedding_model(negative).last_hidden_state optimizer.zero_grad() From 7021f0dd937c3e7d69e4a20a2759c2e601ac8e9f Mon Sep 17 00:00:00 2001 From: MefAldemisov Date: Sun, 13 Feb 2022 13:43:24 +0300 Subject: [PATCH 5/9] Classes are moved to separate files --- .gitignore | 3 + src/bert_attempts/BertBased.py | 225 +++-------------------------- src/bert_attempts/DataGenerator.py | 75 ++++++++++ src/bert_attempts/GCJ.py | 45 ++++++ src/bert_attempts/Network.py | 36 +++++ src/bert_attempts/README.md | 19 +++ src/bert_attempts/TripletLoss.py | 24 +++ 7 files changed, 222 insertions(+), 205 deletions(-) create mode 100644 src/bert_attempts/DataGenerator.py create mode 100644 src/bert_attempts/GCJ.py create mode 100644 src/bert_attempts/Network.py create mode 100644 src/bert_attempts/README.md create mode 100644 src/bert_attempts/TripletLoss.py diff --git a/.gitignore b/.gitignore index 67a7af0..ab03f38 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ myenv* embd/* embd *.csv +*.npy +*.pkl +*pt *.h5 *.png *.json diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py index ff08f14..e1f3950 100644 --- a/src/bert_attempts/BertBased.py +++ b/src/bert_attempts/BertBased.py @@ -1,14 +1,18 @@ -from transformers import RobertaTokenizer, RobertaModel import tqdm -import numpy as np -import pandas as pd import torch + +import numpy as np import torch.nn as nn import torch.optim as optim -from AccuracyEvaluator import AccuracyEvaluator + from sklearn.neighbors import BallTree -from sklearn.preprocessing import LabelEncoder -# with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch + +from AccuracyEvaluator import AccuracyEvaluator +from GCJ import GCJ +from Network import Network +from TripletLoss import TripletLoss +from DataGenerator import generate_data + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -17,92 +21,13 @@ # -------------------------- constants df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv' -tmp_dataset_dir = "../../inputs/preprocessed_jsons/" -tmp_dataset_filename = tmp_dataset_dir + 'bert' + "_train.json" INPUT_SIZE = 512 # 514 tokens, maximum for bert OUTPUT_SIZE = 256 N_EPOCHS = 30 BATCH_SIZE = 16 -# -------------------------- load data - - -def generate_data(): - df = pd.read_csv(df_path) - # df = 
df.drop(columns=["round", "task", "solution", "file", - # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) - # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) - # df = df[(df.n_lines > 0)] - - - # def _insert_tokens(x: str): - # x = x.replace("\n", " NLN ") - # x = x.replace("\t", " TAB ") - # x = x.replace(" ", " SPC ") - # return x - # - # df.flines = df.flines.apply(_insert_tokens) - - # load tokenizer - tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") - df.index = np.arange(len(df)) - le = LabelEncoder() - df.user = le.fit_transform(df.user) - df['tokens'] = df.flines.apply(lambda x: - tokenizer.convert_tokens_to_ids( - tokenizer.tokenize(x))) - - dataset = df[["user", "tokens", "task"]] - # shuffle dataset - dataset = dataset.sample(frac=1) - - X = dataset.tokens.values - - def fillZeros(arr): - arr = np.array(arr) - if INPUT_SIZE > arr.shape[0]: - arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') - else: - arr = arr[:INPUT_SIZE] - return arr.reshape(INPUT_SIZE, 1).tolist() - - X = np.array([fillZeros(x) for x in X]) - X = X.reshape((-1, INPUT_SIZE)) - y = np.array(dataset.user) - tasks = np.array(dataset.task) - train_indexes = np.where(tasks < 7)[0] - test_indexes = np.where(tasks >= 7)[0] - X_train, X_test = X[train_indexes], X[test_indexes] - y_train, y_test = y[train_indexes], y[test_indexes] # 244 unique person - - embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") - - x_emb = [] - with torch.no_grad(): - for i in tqdm.tqdm(range(0, X_test.shape[0], BATCH_SIZE)): - xs = X_train[i: i+BATCH_SIZE] - new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state - x_emb = [*x_emb, *new_xs] - - np.save('x_train.np', X_train) - np.save('y_test.np', y_test) - np.save('y_train.np', y_train) - np.save('x_test.np', X_test) - torch.save(torch.cat(x_emb), 'test_tensor.pt') - print("main part saved") - x_train_emb = [] - with torch.no_grad(): - for i in tqdm.tqdm(range(0, X_train.shape[0], BATCH_SIZE)): - xs = X_train[i: i+BATCH_SIZE] - new_xs = embedding_model(torch.from_numpy(xs)).last_hidden_state - x_train_emb = [*x_train_emb, *new_xs] - # save x_emb, x_train, y_test, y_train - - torch.save(torch.cat(x_train_emb), 'train_tensor.pt') - -# generate_data() -print('restoring') +generate_data(df_path, INPUT_SIZE, OUTPUT_SIZE) X_train = np.load('x_train.np.npy') y_test = np.load('y_test.np.npy') @@ -110,146 +35,36 @@ def fillZeros(arr): X_test = np.load('x_test.np.npy') x_emb = torch.load('test_tensor.pt') x_train_emb = torch.load('train_tensor.pt') +# todo: remove reshaping (looks suspicious) x_emb = torch.reshape(x_emb, (-1, 512, 768)) x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768)) -# -------------------------- model architecture - -# let's do just a simple thing - -# 1. embedding from bert -> INPUT_SIZE * 768 -# 2. convolution (5*768) -# 3. fully-connected 500 -# 4. fully connected 100 - -# 1. 
pretrained part - -# train loader -class GCJ: - def __init__(self, X_train, y_train, batch_size = BATCH_SIZE): - self.x = X_train - self.y = y_train - self.batch_size = batch_size - - def batch_generator(self, model, tree): - n_positive = self.batch_size // 2 - anchor_index = np.random.choice(self.y.shape[0], 1) - y_anchor = self.y[anchor_index] - positive_indexes = np.where(self.y == y_anchor)[0] - n_same = positive_indexes.shape[0] - positive_indexes = positive_indexes[:n_positive] - k = self.batch_size - positive_indexes.shape[0] - - if tree is not None: - with torch.no_grad(): - query = model(self.x[anchor_index]) - query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] - negative_indexes = np.array([neighbour_index for neighbour_index in query_res - if self.y[neighbour_index] != y_anchor])[:k] - else: # the first batch generation - negative_indexes = np.where(self.y != y_anchor)[0] - np.random.shuffle(negative_indexes) - negative_indexes = negative_indexes[:k] - - local_x = self.x.reshape((-1, INPUT_SIZE, 768)) - - reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), - [positive_indexes, negative_indexes]) - - positive, negative = map(lambda i: local_x[i], reduced_indexes) - anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)]) - - return anchor, positive, negative - - def generator(self, model, tree): - while True: - yield self.batch_generator(model, tree) - - -# model - -class Network(nn.Module): - def __init__(self): - super(Network, self).__init__() - # conv_sizes = [2, 4, 16] - k_size = 8 - self.pool_size = INPUT_SIZE - k_size + 1 # output for conv - self.channels = 4 - self.conv = nn.Sequential( - nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), - nn.ReLU(), - nn.Dropout(0.3) - ) - # for size in conv_sizes - # ] - self.fc = nn.Sequential( - nn.Linear(self.pool_size*self.channels, INPUT_SIZE), - nn.ReLU(), - nn.Dropout(0.3), - nn.Linear(INPUT_SIZE, OUTPUT_SIZE), - nn.ReLU() - ) - - def forward(self, x): - # array = [conv(x) for conv in self.conv] - x = torch.reshape(x, (-1, 1, 512, 768)) - x = self.conv(x) - x = x.view(-1, self.channels*self.pool_size) - # x = torch.concat(array, dim=1) - x = self.fc(x) - return x - -# - -# configs - - -class TripletLoss(nn.Module): - def __init__(self, margin=1.0): - super(TripletLoss, self).__init__() - self.margin = margin - - def calc_euclidean(self, x1, x2): - return (x1 - x2).pow(2).sum(1) - - def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor: - distance_positive = self.calc_euclidean(anchor, positive) - distance_negative = self.calc_euclidean(anchor, negative) - losses = torch.relu(distance_positive - distance_negative + self.margin) - - return losses.mean() - def init_weights(m): if isinstance(m, nn.Conv2d): torch.nn.init.xavier_normal_(m.weight) -data_loader = GCJ(x_train_emb, y_train) - -embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") - -model = Network() +data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE) +model = Network(INPUT_SIZE, OUTPUT_SIZE) model.apply(init_weights) model = torch.jit.script(model).to(device) -tree = None # default value +tree = None # default value -optimizer = optim.Adam(model.parameters(), lr=0.001) +optimizer = optim.Adam(model.parameters(), lr=0.01) criterion = torch.jit.script(TripletLoss()) +# todo: check, why x_emb = x_emb[:X_test.shape[0]] callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768) # 
training loop model.train() +params = [] for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"): running_loss = [] for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)): anchor, positive, negative = data_loader.batch_generator(model, tree) - # with torch.no_grad(): - # anchor = embedding_model(anchor).last_hidden_state - # positive = embedding_model(positive).last_hidden_state - # negative = embedding_model(negative).last_hidden_state optimizer.zero_grad() @@ -266,11 +81,11 @@ def init_weights(m): tree = BallTree(predictions, metric="euclidean") current_loss = loss.cpu().detach().numpy() - print(current_loss) running_loss.append(current_loss) # callback (accuracy) metrics = callback.on_epoch_end(model, epoch, current_loss) print(metrics) + params.append(metrics) - print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss))) \ No newline at end of file + print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss))) diff --git a/src/bert_attempts/DataGenerator.py b/src/bert_attempts/DataGenerator.py new file mode 100644 index 0000000..1d57d67 --- /dev/null +++ b/src/bert_attempts/DataGenerator.py @@ -0,0 +1,75 @@ +import torch +import tqdm + +import pandas as pd +import numpy as np + +from sklearn.preprocessing import LabelEncoder +from transformers import RobertaTokenizer, RobertaModel + + +def generate_data(df_path, INPUT_SIZE, BATCH_SIZE): + df = pd.read_csv(df_path) + # df = df.drop(columns=["round", "task", "solution", "file", + # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) + # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n")) + # df = df[(df.n_lines > 0)] + + # def _insert_tokens(x: str): + # x = x.replace("\n", " NLN ") + # x = x.replace("\t", " TAB ") + # x = x.replace(" ", " SPC ") + # return x + # + # df.flines = df.flines.apply(_insert_tokens) + + # load tokenizer + tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm") + df.index = np.arange(len(df)) + le = LabelEncoder() + df.user = le.fit_transform(df.user) + df['tokens'] = df.flines.apply(lambda x: tokenizer + .convert_tokens_to_ids(tokenizer.tokenize(x))) + + dataset = df[["user", "tokens", "task"]] + # shuffle dataset + dataset = dataset.sample(frac=1) + + X = dataset.tokens.values + + def fill_zeros(arr): + arr = np.array(arr) + if INPUT_SIZE > arr.shape[0]: + arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant') + else: + arr = arr[:INPUT_SIZE] + return arr.reshape(INPUT_SIZE, 1).tolist() + + X = np.array([fill_zeros(x) for x in X]) + X = X.reshape((-1, INPUT_SIZE)) + y = np.array(dataset.user) + tasks = np.array(dataset.task) + train_indexes = np.where(tasks < 7)[0] + test_indexes = np.where(tasks >= 7)[0] + X_train, X_test = X[train_indexes], X[test_indexes] + y_train, y_test = y[train_indexes], y[test_indexes] + + embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base") + + def get_embedding(data): + emb = [] + with torch.no_grad(): + for i in tqdm.tqdm(range(0, data.shape[0], BATCH_SIZE)): + batch = data[i: i+BATCH_SIZE] + new_part = embedding_model(torch.from_numpy(batch)).last_hidden_state + emb = [*emb, *new_part] + return emb + + x_emb = get_embedding(X_test) + np.save('x_train.np', X_train) + np.save('y_test.np', y_test) + np.save('y_train.np', y_train) + np.save('x_test.np', X_test) + torch.save(torch.cat(x_emb), 'test_tensor.pt') + x_train_emb = get_embedding(X_train) + torch.save(torch.cat(x_train_emb), 'train_tensor.pt') diff --git 
a/src/bert_attempts/GCJ.py b/src/bert_attempts/GCJ.py new file mode 100644 index 0000000..b6c3de0 --- /dev/null +++ b/src/bert_attempts/GCJ.py @@ -0,0 +1,45 @@ +import torch +import numpy as np + +''' +The loader of the train data (batch generator) +''' + + +class GCJ: + + def __init__(self, X_train, y_train, batch_size, input_size): + self.x = X_train + self.y = y_train + self.batch_size = batch_size + self.input_size = input_size + + def batch_generator(self, model, tree): + n_positive = self.batch_size // 2 + anchor_index = np.random.choice(self.y.shape[0], 1) + y_anchor = self.y[anchor_index] + positive_indexes = np.where(self.y == y_anchor)[0] + n_same = positive_indexes.shape[0] + positive_indexes = positive_indexes[:n_positive] + k = self.batch_size - positive_indexes.shape[0] + + if tree is not None: + with torch.no_grad(): + query = model(self.x[anchor_index]) + query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0] + negative_indexes = np.array([neighbour_index for neighbour_index in query_res + if self.y[neighbour_index] != y_anchor])[:k] + else: # the first batch generation + negative_indexes = np.where(self.y != y_anchor)[0] + np.random.shuffle(negative_indexes) + negative_indexes = negative_indexes[:k] + + local_x = self.x.reshape((-1, self.input_size, 768)) + + reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size), + [positive_indexes, negative_indexes]) + + positive, negative = map(lambda i: local_x[i], reduced_indexes) + anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)]) + + return anchor, positive, negative diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py new file mode 100644 index 0000000..2656e75 --- /dev/null +++ b/src/bert_attempts/Network.py @@ -0,0 +1,36 @@ +import torch +from torch import nn + + +class Network(nn.Module): + def __init__(self, input_size, output_size): + super(Network, self).__init__() + + self.input_size = input_size + self.output_size = output_size + # conv_sizes = [2, 4, 16] + k_size = 8 + self.pool_size = self.input_size - k_size + 1 # output for conv + self.channels = 4 + self.conv = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), + nn.ReLU(), + nn.Dropout(0.3) + ) + # for size in conv_sizes + self.fc = nn.Sequential( + nn.Linear(self.pool_size*self.channels, self.input_size), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(self.input_size, self.output_size), + nn.ReLU() + ) + + def forward(self, x): + # array = [conv(x) for conv in self.conv] + x = torch.reshape(x, (-1, 1, self.input_size, 768)) + x = self.conv(x) + x = x.view(-1, self.channels*self.pool_size) + # x = torch.concat(array, dim=1) + x = self.fc(x) + return x diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md new file mode 100644 index 0000000..066dbb8 --- /dev/null +++ b/src/bert_attempts/README.md @@ -0,0 +1,19 @@ +# Bert - like attempts + +## Current tasks +- [ ] add model saving +- [ ] add loss savings +- [ ] limit the number of tests evaluations => speed-up the training +- [x] split the file on separate classes +- [ ] change model to one in `Embedding.py` (regularization, parallel evaluation) +- [ ] debug + +## Execution: +1. Start file - `BertBased.py` +2. 
The line with data generation:
+
+```python
+generate_data(df_path, INPUT_SIZE, OUTPUT_SIZE)
+```
+
+Comment this call out if the data has already been generated (it executes on the CPU and takes ~1.5h)
\ No newline at end of file
diff --git a/src/bert_attempts/TripletLoss.py b/src/bert_attempts/TripletLoss.py
new file mode 100644
index 0000000..875959a
--- /dev/null
+++ b/src/bert_attempts/TripletLoss.py
@@ -0,0 +1,24 @@
+import torch
+from torch import nn
+
+'''
+with reference to https://www.kaggle.com/hirotaka0122/triplet-loss-with-pytorch
+'''
+
+
+class TripletLoss(nn.Module):
+
+    def __init__(self, margin=0.1):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+
+    @staticmethod
+    def calc_euclidean(x1, x2):
+        return (x1 - x2).pow(2).sum(1)
+
+    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
+        distance_positive = self.calc_euclidean(anchor, positive)
+        distance_negative = self.calc_euclidean(anchor, negative)
+        # note: relu(x) = max(x, 0), so this is the standard triplet hinge max(0, d_pos - d_neg + margin)
+        losses = torch.relu(distance_positive - distance_negative + self.margin)
+        return losses.mean()
From 35d5899fd97903578527393a79e774f8d449a1bd Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Sun, 13 Feb 2022 14:03:45 +0300
Subject: [PATCH 6/9] Model and data saving added, number of test evaluations reduced

---
 src/bert_attempts/BertBased.py | 26 ++++++++++++++----------
 src/bert_attempts/README.md    |  9 +++++----
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index e1f3950..e90f219 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -1,5 +1,6 @@
 import tqdm
 import torch
+import pickle
 
 import numpy as np
 import torch.nn as nn
@@ -63,7 +64,7 @@ def init_weights(m):
 params = []
 for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"):
     running_loss = []
-    for step in enumerate(tqdm.tqdm(range(len(np.unique(y_train))), desc="Training", leave=False)):
+    for step in tqdm.tqdm(range(np.unique(y_train).shape[0]), desc="Training", leave=False):
         anchor, positive, negative = data_loader.batch_generator(model, tree)
 
         optimizer.zero_grad()
@@ -76,16 +77,21 @@ def init_weights(m):
         loss.backward()
         optimizer.step()
 
-        with torch.no_grad():
-            predictions = model(x_train_emb)
-        tree = BallTree(predictions, metric="euclidean")
+        if (step % 10 == 0):
+            with torch.no_grad():
+                predictions = model(x_train_emb)
+            tree = BallTree(predictions, metric="euclidean")
 
-        current_loss = loss.cpu().detach().numpy()
-        running_loss.append(current_loss)
+            current_loss = loss.cpu().detach().numpy()
+            running_loss.append(current_loss)
 
-        # callback (accuracy)
-        metrics = callback.on_epoch_end(model, epoch, current_loss)
-        print(metrics)
-        params.append(metrics)
+            # callback (accuracy)
+            metrics = callback.on_epoch_end(model, epoch, current_loss)
+            print(metrics)
+            params.append(metrics)
+            with open('training.pkl', 'wb'):
+                pickle.dump(params)
+
+    torch.save(model.state_dict(), 'model')
 
     print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md
index 066dbb8..7a54f44 100644
--- a/src/bert_attempts/README.md
+++ b/src/bert_attempts/README.md
@@ -1,12 +1,13 @@
 # Bert - like attempts
 
 ## Current tasks
-- [ ] add model saving
-- [ ] add loss savings
-- [ ] limit the number of tests evaluations => speed-up the training
+- [x] ! add model saving
+- [x] ! add loss savings
+- [x] ! limit the number of tests evaluations => speed-up the training
 - [x] split the file on separate classes
-- [ ] change model to one in `Embedding.py` (regularization, parallel evaluation)
+- [ ] !! change model to one in `Embedding.py` (regularization, parallel evaluation)
 - [ ] debug
+- [ ] separate data from code
 
 ## Execution:
 1. Start file - `BertBased.py`
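The `BallTree` rebuilt every 10th step in the patch above powers the hard-negative mining in `GCJ.batch_generator`: the anchor is embedded, its nearest neighbours in the current embedding space are retrieved, and the close samples with a different author label become the batch's negatives. A condensed restatement of that selection logic, using the names from `GCJ.py` (`model`, `tree`, and the index variables come from the surrounding code):

```python
import numpy as np
import torch

with torch.no_grad():
    query = model(self.x[anchor_index])          # embed the anchor
# nearest neighbours of the anchor under the current model
neighbours = tree.query(query, self.batch_size + n_same,
                        return_distance=False)[0]
# close samples by OTHER authors = the hardest negatives right now
negative_indexes = np.array([i for i in neighbours
                             if self.y[i] != y_anchor])[:k]
```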
From 49cdedb3b05cec15b94bdefcbff7ea83a83fefbc Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Sun, 13 Feb 2022 15:26:59 +0300
Subject: [PATCH 7/9] Added paths, network arch changed

---
 src/bert_attempts/BertBased.py     | 46 ++++++++++++++++++++----------
 src/bert_attempts/DataGenerator.py | 14 ++++-----
 src/bert_attempts/Network.py       | 31 +++++++++++++-------
 src/bert_attempts/README.md        |  6 ++--
 4 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index e90f219..f3906df 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -1,3 +1,4 @@
+import os
 import tqdm
 import torch
 import pickle
@@ -22,23 +23,21 @@
 # -------------------------- constants
 df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
+DATA_PATH = './data/'
+TRAIN_PATH = './train/'
 
 INPUT_SIZE = 512  # 514 tokens, maximum for bert
 OUTPUT_SIZE = 256
 N_EPOCHS = 30
 BATCH_SIZE = 16
 
-generate_data(df_path, INPUT_SIZE, OUTPUT_SIZE)
 
-X_train = np.load('x_train.np.npy')
-y_test = np.load('y_test.np.npy')
-y_train = np.load('y_train.np.npy')
-X_test = np.load('x_test.np.npy')
-x_emb = torch.load('test_tensor.pt')
-x_train_emb = torch.load('train_tensor.pt')
-# todo: remove reshaping (looks suspicious)
-x_emb = torch.reshape(x_emb, (-1, 512, 768))
-x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768))
+def mkdir(dir_name):
+    # create dirs
+    try:
+        os.makedirs(dir_name)
+    except FileExistsError:
+        print('Dir exists')
 
 
 def init_weights(m):
@@ -46,11 +45,28 @@ def init_weights(m):
         torch.nn.init.xavier_normal_(m.weight)
 
 
-data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)
+mkdir(DATA_PATH)
+mkdir(TRAIN_PATH)
+
 model = Network(INPUT_SIZE, OUTPUT_SIZE)
 model.apply(init_weights)
 model = torch.jit.script(model).to(device)
 
+generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)
+
+X_train = np.load(DATA_PATH + 'x_train.np.npy')
+y_test = np.load(DATA_PATH + 'y_test.np.npy')
+y_train = np.load(DATA_PATH + 'y_train.np.npy')
+X_test = np.load(DATA_PATH + 'x_test.np.npy')
+x_emb = torch.load(DATA_PATH + 'test_tensor.pt')
+x_train_emb = torch.load(DATA_PATH + 'train_tensor.pt')
+x_emb = torch.reshape(x_emb, (-1, 512, 768))
+x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768))
+
+
+data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)
+
+
 tree = None  # default value
 
 optimizer = optim.Adam(model.parameters(), lr=0.01)
 criterion = torch.jit.script(TripletLoss())
@@ -77,7 +93,7 @@ def init_weights(m):
         loss.backward()
         optimizer.step()
 
-        if (step % 10 == 0):
+        if step % 10 == 0:
             with torch.no_grad():
                 predictions = model(x_train_emb)
             tree = BallTree(predictions, metric="euclidean")
@@ -89,9 +105,9 @@ def init_weights(m):
             metrics = callback.on_epoch_end(model, epoch, current_loss)
             print(metrics)
             params.append(metrics)
-            with open('training.pkl', 'wb'):
-                pickle.dump(params)
+            with open(TRAIN_PATH + 'training.pkl', 'wb') as f:
+                pickle.dump(params, f)
 
-    torch.save(model.state_dict(), 'model')
+    torch.save(model.state_dict(), TRAIN_PATH + 'model')
 
     print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
diff --git a/src/bert_attempts/DataGenerator.py 
b/src/bert_attempts/DataGenerator.py index 1d57d67..cba9d54 100644 --- a/src/bert_attempts/DataGenerator.py +++ b/src/bert_attempts/DataGenerator.py @@ -8,7 +8,7 @@ from transformers import RobertaTokenizer, RobertaModel -def generate_data(df_path, INPUT_SIZE, BATCH_SIZE): +def generate_data(df_path: str, data_path: str, INPUT_SIZE: int, BATCH_SIZE: int): df = pd.read_csv(df_path) # df = df.drop(columns=["round", "task", "solution", "file", # "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"]) @@ -66,10 +66,10 @@ def get_embedding(data): return emb x_emb = get_embedding(X_test) - np.save('x_train.np', X_train) - np.save('y_test.np', y_test) - np.save('y_train.np', y_train) - np.save('x_test.np', X_test) - torch.save(torch.cat(x_emb), 'test_tensor.pt') + np.save(data_path + 'x_train.np', X_train) + np.save(data_path + 'y_test.np', y_test) + np.save(data_path + 'y_train.np', y_train) + np.save(data_path + 'x_test.np', X_test) + torch.save(torch.cat(x_emb), data_path + 'test_tensor.pt') x_train_emb = get_embedding(X_train) - torch.save(torch.cat(x_train_emb), 'train_tensor.pt') + torch.save(torch.cat(x_train_emb), data_path + 'train_tensor.pt') diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py index 2656e75..f94c35a 100644 --- a/src/bert_attempts/Network.py +++ b/src/bert_attempts/Network.py @@ -8,17 +8,24 @@ def __init__(self, input_size, output_size): self.input_size = input_size self.output_size = output_size - # conv_sizes = [2, 4, 16] - k_size = 8 - self.pool_size = self.input_size - k_size + 1 # output for conv - self.channels = 4 - self.conv = nn.Sequential( - nn.Conv2d(1, self.channels, kernel_size=(k_size, 768),), - nn.ReLU(), - nn.Dropout(0.3) - ) - # for size in conv_sizes + self.conv_sizes = [2, 4, 16] + self.pool_size = sum([self.input_size - size + 1 for size in self.conv_sizes]) # output for conv + self.channels = 2 + self.conv1 = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(2, 768),), + nn.ReLU(), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(4, 768),), + nn.ReLU(), + ) + self.conv3 = nn.Sequential( + nn.Conv2d(1, self.channels, kernel_size=(16, 768),), + nn.ReLU(), + ) + self.fc = nn.Sequential( + nn.Dropout(0.5), nn.Linear(self.pool_size*self.channels, self.input_size), nn.ReLU(), nn.Dropout(0.3), @@ -29,7 +36,9 @@ def __init__(self, input_size, output_size): def forward(self, x): # array = [conv(x) for conv in self.conv] x = torch.reshape(x, (-1, 1, self.input_size, 768)) - x = self.conv(x) + + # torch.view(-1, self.channels * self.input_size - size + 1) + x = torch.cat([self.conv1(x), self.conv2(x), self.conv3(x)]) x = x.view(-1, self.channels*self.pool_size) # x = torch.concat(array, dim=1) x = self.fc(x) diff --git a/src/bert_attempts/README.md b/src/bert_attempts/README.md index 7a54f44..d86a573 100644 --- a/src/bert_attempts/README.md +++ b/src/bert_attempts/README.md @@ -5,9 +5,9 @@ - [x] ! add loss savings - [x] ! limit the number of tests evaluations => speed-up the training - [x] split the file on separate classes -- [ ] !! change model to one in `Embedding.py` (regularization, parallel evaluation) -- [ ] debug -- [ ] separate data from code +- [x] !! change model to one in `Embedding.py` (regularization, parallel evaluation) +- [x] debug +- [x] separate data from code ## Execution: 1. 
From bf68ec2ddd46e5bb4a75dca1f79ba9117872ba3f Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Wed, 16 Feb 2022 00:06:26 +0300
Subject: [PATCH 8/9] Two unsuccessful training runs

---
 src/bert_attempts/BertBased.py |  4 ++--
 src/bert_attempts/Network.py   | 20 +++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index f3906df..d2b5252 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -52,7 +52,7 @@ def init_weights(m):
 model.apply(init_weights)
 model = torch.jit.script(model).to(device)

-generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)
+# generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)

 X_train = np.load(DATA_PATH + 'x_train.np.npy')
 y_test = np.load(DATA_PATH + 'y_test.np.npy')
@@ -69,7 +69,7 @@ def init_weights(m):

 tree = None  # default value
-optimizer = optim.Adam(model.parameters(), lr=0.01)
+optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.05)
 criterion = torch.jit.script(TripletLoss())
 # todo: check, why
 x_emb = x_emb[:X_test.shape[0]]
diff --git a/src/bert_attempts/Network.py b/src/bert_attempts/Network.py
index f94c35a..832e2ec 100644
--- a/src/bert_attempts/Network.py
+++ b/src/bert_attempts/Network.py
@@ -9,7 +9,7 @@ def __init__(self, input_size, output_size):
         self.input_size = input_size
         self.output_size = output_size
         self.conv_sizes = [2, 4, 16]
-        self.pool_size = sum([self.input_size - size + 1 for size in self.conv_sizes])  # output for conv
+        self.pool_size = [self.input_size - size + 1 for size in self.conv_sizes]  # outputs for convs
         self.channels = 2
         self.conv1 = nn.Sequential(
             nn.Conv2d(1, self.channels, kernel_size=(2, 768),),
@@ -25,10 +25,12 @@
         )

         self.fc = nn.Sequential(
+            nn.LayerNorm(sum(self.pool_size)*self.channels),
             nn.Dropout(0.5),
-            nn.Linear(self.pool_size*self.channels, self.input_size),
+            nn.Linear(sum(self.pool_size)*self.channels, self.input_size),
             nn.ReLU(),
-            nn.Dropout(0.3),
+            nn.LayerNorm(self.input_size),
+            nn.Dropout(0.5),
             nn.Linear(self.input_size, self.output_size),
             nn.ReLU()
         )
@@ -38,8 +40,16 @@ def forward(self, x):
         x = torch.reshape(x, (-1, 1, self.input_size, 768))

         # torch.view(-1, self.channels * self.input_size - size + 1)
-        x = torch.cat([self.conv1(x), self.conv2(x), self.conv3(x)])
-        x = x.view(-1, self.channels*self.pool_size)
+        x1 = self.conv1(x)
+        x2 = self.conv2(x)
+        x3 = self.conv3(x)
+
+        x1 = x1.view(-1, self.channels*self.pool_size[0])
+        x2 = x2.view(-1, self.channels*self.pool_size[1])
+        x3 = x3.view(-1, self.channels*self.pool_size[2])
+
+        x = torch.cat([x1, x2, x3], -1)
+        x = x.view(-1, self.channels*sum(self.pool_size))
         # x = torch.concat(array, dim=1)
         x = self.fc(x)
         return x
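`TripletLoss` is imported from elsewhere in the repository and never shown in this series. Assuming the usual `(anchor, positive, negative)` calling convention from the Kaggle notebook referenced in `BertBased.py`, a minimal scriptable sketch could look like:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TripletLoss(nn.Module):
        # hinge-style triplet loss: keep d(anchor, positive) below
        # d(anchor, negative) by at least `margin`
        def __init__(self, margin: float = 1.0):
            super().__init__()
            self.margin = margin

        def forward(self, anchor: torch.Tensor, positive: torch.Tensor,
                    negative: torch.Tensor) -> torch.Tensor:
            d_pos = F.pairwise_distance(anchor, positive)
            d_neg = F.pairwise_distance(anchor, negative)
            return torch.relu(d_pos - d_neg + self.margin).mean()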
From e43f6b7aaa9306078802d11e290680c8788ff814 Mon Sep 17 00:00:00 2001
From: MefAldemisov
Date: Wed, 27 Apr 2022 22:10:06 +0300
Subject: [PATCH 9/9] Configuration

---
 src/bert_attempts/BertBased.py                 | 8 ++++----
 src/main.py                                    | 6 +++---
 src/models/Embedding.py                        | 9 +++++----
 src/models/data_processing/TokenFeatures.py    | 9 ++++-----
 src/models/data_processing/base/DataLoading.py | 2 +-
 src/visualization/base/Visualizer.py           | 2 +-
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/bert_attempts/BertBased.py b/src/bert_attempts/BertBased.py
index d2b5252..4de4ad9 100644
--- a/src/bert_attempts/BertBased.py
+++ b/src/bert_attempts/BertBased.py
@@ -69,7 +69,7 @@ def init_weights(m):

 tree = None  # default value
-optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.05)
+optimizer = optim.Adam(model.parameters(), lr=10**(-6), weight_decay=0.05)
 criterion = torch.jit.script(TripletLoss())
 # todo: check, why
 x_emb = x_emb[:X_test.shape[0]]
@@ -94,9 +94,9 @@ def init_weights(m):
             optimizer.step()

             if step % 10 == 0:
-                with torch.no_grad():
-                    predictions = model(x_train_emb)
-                tree = BallTree(predictions, metric="euclidean")
+                # with torch.no_grad():
+                #     predictions = model(x_train_emb)
+                # tree = BallTree(predictions, metric="euclidean")
                 current_loss = loss.cpu().detach().numpy()
                 running_loss.append(current_loss)
diff --git a/src/main.py b/src/main.py
index 5dfd8d8..aea8cf4 100644
--- a/src/main.py
+++ b/src/main.py
@@ -2,7 +2,7 @@
 # from training.AvgTriplet import AverageTriplet
 from models.Embedding import Embedding
 # from models.Conv2D import Conv2D
-from visualization.VisualizerTokenFeatures import VisualizerTokenFeatures
+# from visualization.VisualizerTokenFeatures import VisualizerTokenFeatures
 # from visualization.VisualizerCharFeatures import VisualizerCharFeatures

 import tensorflow as tf
@@ -11,8 +11,8 @@
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu, True)

-model = Embedding(input_size=800, crop=200, output_size=50, make_initial_preprocess=True)
-SingleTriplet(model=model).train(batch_size=16, epochs=40, epoch_start=0, step_start=0)
+model = Embedding(input_size=800, crop=800, output_size=50, make_initial_preprocess=True)
+SingleTriplet(model=model).train(batch_size=16, epochs=50, epoch_start=0, step_start=0)

 # VisualizerTokenFeatures().run()
 # model = Conv2D()
diff --git a/src/models/Embedding.py b/src/models/Embedding.py
index b1a4643..dcde9de 100644
--- a/src/models/Embedding.py
+++ b/src/models/Embedding.py
@@ -28,9 +28,10 @@ def create_after_emb(self, reshape1,
                          conv_channels=2,
                          emb_height=100,
                          activation="relu",
-                         L2_lambda=0.02,
-                         conv_sizes=[2, 4, 16]):
+                         L2_lambda=0.05,
+                         conv_sizes=[2, 4, 8, 16]):
         # parallel piece
+        # d1 = layers.Dropout(0.3)(reshape1)
         convolutions = [layers.Conv2D(conv_channels, (conv_size, emb_height),
                                       name="conv2d_size_{}".format(conv_size),
                                       padding="same", activation=activation,
@@ -60,8 +61,8 @@ def create_after_emb(self, reshape1,

     def create_model(self,
                      activation: str = "relu",
-                     L2_lambda: float = 0.02,
-                     conv_sizes: List[int] = [2, 4, 16],
+                     L2_lambda: float = 0.05,
+                     conv_sizes: List[int] = [2, 4, 8, 16],
                      emb_height: int = 100):

         conv_channels = 2
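This commit disables the periodic BallTree refresh in the training loop above, so triplet mining proceeds without an up-to-date neighbour index. For context, a sketch of how such a tree is typically queried for hard negatives; `embeddings` and `labels` here are hypothetical stand-ins for the model outputs and the author ids:

    import numpy as np
    from sklearn.neighbors import BallTree

    # embeddings: (n_samples, emb_dim) array; labels: (n_samples,) author ids
    tree = BallTree(embeddings, metric="euclidean")
    _, idx = tree.query(embeddings, k=10)  # 10 nearest neighbours per sample
    # skip idx[:, 0] (each sample is its own nearest neighbour); a hard
    # negative is the closest neighbour written by a different author
    hard_neg = [next((j for j in row[1:] if labels[j] != labels[i]), row[-1])
                for i, row in enumerate(idx)]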
diff --git a/src/models/data_processing/TokenFeatures.py b/src/models/data_processing/TokenFeatures.py
index 5b85a31..0113c2c 100644
--- a/src/models/data_processing/TokenFeatures.py
+++ b/src/models/data_processing/TokenFeatures.py
@@ -29,7 +29,7 @@ def __init__(self,

     @staticmethod
     def _write_vocab_file(filepath: str, vocab: List[str]):
-        with open(filepath, "w") as f:
+        with open(filepath, "w", encoding="utf-8") as f:
             for token in vocab:
                 print(token, file=f)

@@ -38,7 +38,7 @@
         x = x.replace("\n", " NLN ")
         x = x.replace("\t", " TAB ")
         x = x.replace(" ", " SPC ")
-        return x
+        return x.encode("utf-8")

     def initial_preprocess(self, df_path: str, tmp_dataset_filename: str):
         df = self._initial_load(df_path)
@@ -61,11 +61,11 @@
         # reduce the size of the dataset according to the n_tokens
         df.index = np.arange(len(df))
         df["n_tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).shape[0])
-        df = df[df.n_tokens <= self.input_size]
+        # df = df[df.n_tokens <= self.input_size]
         # reindex
         df.index = np.arange(len(df))
         # reduce size
-        df = self._user_selection_and_encoding(df, 50, 450)
+        df = self._user_selection_and_encoding(df, 0, 400)
         # long saving
         # The issue is that `tokenizer.tokenize()` does not always return a shape (-1, 1).
         # Some elements of the result of the function could be a list, e.g. [[2929, 8524]].
@@ -104,7 +104,6 @@ def secondary_preprocess(self, tmp_dataset_filename: str):
         test_indexes = np.where(tasks >= 7)[0]
         X_train, X_test = X[train_indexes], X[test_indexes]
         y_train, y_test = y[train_indexes], y[test_indexes]
-
         # X_train, y_train = self._crop_to(X_train, y_train, rs1=(-1, self.crop), rs2=(-1, self.crop, 1))
         # X_test, y_test = self._crop_to(X_test, y_test, rs1=(-1, self.crop), rs2=(-1, self.crop, 1))
         # self.input_size = self.crop
diff --git a/src/models/data_processing/base/DataLoading.py b/src/models/data_processing/base/DataLoading.py
index f9d574f..a5a59f0 100644
--- a/src/models/data_processing/base/DataLoading.py
+++ b/src/models/data_processing/base/DataLoading.py
@@ -87,7 +87,7 @@ def _crop_to(self,
         return new_X, new_y

     def preprocess(self,
-                   df_path: str = "../inputs/processed_dfs/cpp_9_tasks_2016.csv",
+                   df_path: str = "../inputs/processed_dfs/valid_py_9_tasks_2020.csv",
                    tmp_dataset_dir: str = "../inputs/preprocessed_jsons/") -> Tuple[np.ndarray, np.ndarray,
                                                                                     np.ndarray, np.ndarray]:
         """
diff --git a/src/visualization/base/Visualizer.py b/src/visualization/base/Visualizer.py
index 750b466..5833267 100644
--- a/src/visualization/base/Visualizer.py
+++ b/src/visualization/base/Visualizer.py
@@ -23,7 +23,7 @@ def __init__(self,
         self.model_name = model_name
         self.snippet_index = snippet_index

-        self.model = tf.keras.models.load_model('../outputs/{}_0.h'.format(model_name))
+        self.model = tf.keras.models.load_model('../outputs/{}_49.h'.format(model_name))
         all_x, _, all_y, _ = data_loader.secondary_preprocess("../inputs/preprocessed_jsons/{}_train.json"
                                                               .format(model_name))
         self.triplet_type = AverageTriplet(self.model)
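A closing note on the checkpoint path: `main.py` now trains for 50 epochs, and the checkpoints appear to be saved once per epoch under zero-based names, which is why the visualizer loads `'{}_49.h'`. If the schedule changes again, a sketch that picks the newest checkpoint instead of a hard-coded index, assuming the same `<name>_<epoch>.h` naming, could be:

    import glob
    import re
    import tensorflow as tf

    paths = glob.glob('../outputs/{}_*.h'.format(model_name))
    # sort by the trailing epoch number and load the most recent checkpoint
    latest = max(paths, key=lambda p: int(re.search(r'_(\d+)\.h$', p).group(1)))
    model = tf.keras.models.load_model(latest)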