Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ myenv*
embd/*
embd
*.csv
*.npy
*.pkl
*.pt
*.h5
*.png
*.json
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ tqdm==4.58.0
sentencepiece==0.1.95
pydot==1.4.2
tensorflow-text==2.5.0
torch==1.10.0
torchvision==0.11.1
transformers==4.15.0
129 changes: 129 additions & 0 deletions src/bert_attempts/AccuracyEvaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import datetime
import io
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

class AccuracyEvaluator:
    """Per-epoch evaluation callback for the triplet-loss embedding model.

    After each epoch the current embeddings are classified with a
    KNeighborsClassifier and the accuracy is reported; optionally a 2-D
    t-SNE projection of the embeddings is saved as a PNG for inspection.
    """

    def __init__(self,
                 X_train,
                 X_test,
                 y_train: np.ndarray,
                 y_test: np.ndarray,
                 threshold: float = 0.1,
                 input_size: int = 500,
                 authors: Optional[List] = None):
        """
        Parameters:
        - `X_train`, `X_test` - arrays with data (tokens)
        - `y_train`, `y_test` - np.arrays, labels (numerical representation of authors)

        - `threshold` - alpha parameter of the triplet loss, threshold for the
          classification's distance
        - `input_size` - amount of tokens in one file
        - `authors` - subset of author labels to evaluate/plot; the prediction
          stage requires an all-with-all comparison (O(n^2)), which is why it
          is reduced. Defaults to the first 20 authors.
          (BUGFIX: was a mutable default argument `list(range(20))`; replaced
          with the None-sentinel idiom — same effective default.)
        """
        super().__init__()
        self.threshold = threshold
        self.input_size = input_size
        # x-y preprocessing
        self.authors = list(range(20)) if authors is None else authors

        def select_authors(initial_x, initial_y):
            # Keep only the samples whose label is in the reduced author set.
            index = np.where(np.isin(initial_y, self.authors))[0]
            new_x = initial_x[index]
            new_y = initial_y[index]
            return new_x, new_y

        simple_x_train, simple_y_train = select_authors(X_train, y_train)
        simple_x_test, simple_y_test = select_authors(X_test, y_test)

        self.data = {
            "simple": {
                "train": [simple_x_train, simple_y_train],
                "test": [simple_x_test, simple_y_test]
            },
            "full": {
                "train": [X_train, y_train],
                "test": [X_test, y_test]
            }
        }

        # Step counter, incremented once per on_epoch_end call.
        self.n = 0

    @staticmethod
    def _plot_to_image(figure):
        """Render the current matplotlib figure into a PNG byte buffer.

        Based on https://www.tensorflow.org/tensorboard/image_summaries
        BUGFIX: the original never returned the buffer, so callers always
        received None.
        """
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        plt.close(figure)
        buf.seek(0)
        return buf

    def apply_dimensionality_reduction(self,
                                       transformed_x,
                                       y: np.ndarray,
                                       epoch: int,
                                       is_test: bool):
        """Project embeddings to 2-D with t-SNE and save one scatter per author."""
        vectors = TSNE(n_components=2)
        x_pca = vectors.fit_transform(transformed_x)
        figure = plt.figure(figsize=(10, 8))
        plt.title("Step {} (epoch {})".format(self.n, epoch))
        for developer in self.authors:
            indexes = np.where(y == developer)[0]
            plt.plot(x_pca[indexes, 0], x_pca[indexes, 1], "o", ms=5)
        # save as file (assumes an existing "outputs/" directory — TODO confirm)
        plt.savefig("outputs/tsne_{}.png".format(self.n))
        # log to tensorboard (disabled)
        # image = self._plot_to_image(figure)
        # writer = self.test_summary_writer if is_test else self.train_summary_writer
        # with writer.as_default():
        #     tf.summary.image("Distribution of authors", image, step=self.n)

        plt.close("all")

    def get_acc(self,
                model,
                x,
                y: np.ndarray,
                epoch: int,
                is_test: bool,
                dim_red: bool = True) -> float:
        """Embed `x` with `model` and return the KNN accuracy on the embeddings.

        BUGFIX: the original signature was `dim_red: True` (a literal used as
        a type annotation, with no default); it is now a proper `bool` flag.
        Callers pass it positionally, so the interface is unchanged.
        """
        with torch.no_grad():
            transformed_x = model(x)
            # Fit and evaluate a KNN classifier on the embedded samples.
            knn = KNeighborsClassifier().fit(transformed_x, y)
            predictions = knn.predict(transformed_x)
            accuracy = accuracy_score(y_true=y, y_pred=predictions)
        if dim_red:
            self.apply_dimensionality_reduction(transformed_x, y, epoch, is_test)
        return accuracy

    def _writer(self,
                x,
                y,
                model,
                epoch: int,
                is_test: bool,
                is_simple: bool) -> float:
        """Thin wrapper around get_acc.

        NOTE(review): `is_simple` is forwarded as get_acc's `dim_red` flag,
        so t-SNE plots are only produced for the reduced ("simple") subset.
        """
        accuracy = self.get_acc(model, x, y, epoch, is_test, is_simple)
        return accuracy

    def on_epoch_end(self,
                     model,
                     epoch: int,
                     loss: float):
        """Evaluate all three splits; returns (train_simple, test_simple, test_full)."""
        aste = self._writer(*self.data["simple"]["test"], model, epoch, True, True)
        astr = self._writer(*self.data["simple"]["train"], model, epoch, False, True)
        afte = self._writer(*self.data["full"]["test"], model, epoch, True, False)

        print(loss, astr, aste, afte)
        self.n += 1
        return astr, aste, afte
113 changes: 113 additions & 0 deletions src/bert_attempts/BertBased.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import tqdm
import torch
import pickle

import numpy as np
import torch.nn as nn
import torch.optim as optim

from sklearn.neighbors import BallTree

from AccuracyEvaluator import AccuracyEvaluator
from GCJ import GCJ
from Network import Network
from TripletLoss import TripletLoss
from DataGenerator import generate_data


# Select GPU when available; model and data are moved to this device below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Informational only: resolves the CUDA device name (return value unused).
    torch.cuda.get_device_name()

# -------------------------- constants
df_path = '../../inputs/processed_dfs/cpp_9_tasks_2016.csv'
DATA_PATH = './data/'   # where generate_data stores/loads dataset artifacts
TRAIN_PATH = './train/' # where training metrics and model weights are saved

INPUT_SIZE = 512  # tokens per file (RoBERTa allows 514 positions incl. specials)
OUTPUT_SIZE = 256  # embedding dimensionality produced by Network
N_EPOCHS = 30
BATCH_SIZE = 16


def mkdir(dir_name):
    """Create `dir_name` (including missing parents); no-op if it exists.

    Uses the idiomatic `exist_ok=True` instead of the original
    try/except-FileExistsError with a diagnostic print — directory
    creation is idempotent either way.
    """
    os.makedirs(dir_name, exist_ok=True)


def init_weights(m):
    """Xavier-normal initialisation for Conv2d weights; other modules untouched.

    Intended for use with ``model.apply(init_weights)``.
    """
    if not isinstance(m, nn.Conv2d):
        return
    nn.init.xavier_normal_(m.weight)


# Ensure output directories exist before any artifacts are written.
mkdir(DATA_PATH)
mkdir(TRAIN_PATH)

# Build the embedding network, Xavier-init conv layers, then
# TorchScript-compile and move to the target device.
model = Network(INPUT_SIZE, OUTPUT_SIZE)
model.apply(init_weights)
model = torch.jit.script(model).to(device)

# One-off preprocessing (tokenize + CodeBERT-embed the dataset); its outputs
# are the .npy/.pt files loaded below.
# generate_data(df_path, DATA_PATH, INPUT_SIZE, BATCH_SIZE=64)

# Token-id arrays and precomputed CodeBERT embeddings saved by generate_data.
X_train = np.load(DATA_PATH + 'x_train.np.npy')
y_test = np.load(DATA_PATH + 'y_test.np.npy')
y_train = np.load(DATA_PATH + 'y_train.np.npy')
X_test = np.load(DATA_PATH + 'x_test.np.npy')
x_emb = torch.load(DATA_PATH + 'test_tensor.pt')
x_train_emb = torch.load(DATA_PATH + 'train_tensor.pt')
# Flat tensors -> (n_samples, 512 tokens, 768 hidden dims).
x_emb = torch.reshape(x_emb, (-1, 512, 768))
x_train_emb = torch.reshape(x_train_emb, (-1, 512, 768))


# Triplet batch generator over the training embeddings.
data_loader = GCJ(x_train_emb, y_train, BATCH_SIZE, INPUT_SIZE)


# BallTree for hard-negative mining; None -> random negatives (first batches).
tree = None  # default value

optimizer = optim.Adam(model.parameters(), lr=10**(-6), weight_decay=0.05)
criterion = torch.jit.script(TripletLoss())
# todo: check, why
# NOTE(review): truncates test embeddings to the number of test labels —
# presumably the saved tensor has extra rows; verify in generate_data.
x_emb = x_emb[:X_test.shape[0]]
callback = AccuracyEvaluator(x_train_emb, x_emb, y_train, y_test, input_size=768)

# training loop
model.train()
params = []  # per-epoch accuracy metrics, re-pickled after every epoch
for epoch in tqdm.tqdm(range(N_EPOCHS), desc="Epochs"):
    running_loss = []
    # One "step" per unique author: each batch is anchored on one author.
    for step in tqdm.tqdm(range(np.unique(y_train).shape[0]), desc="Training", leave=False):
        anchor, positive, negative = data_loader.batch_generator(model, tree)

        optimizer.zero_grad()

        anchor_out = model(anchor)
        positive_out = model(positive)
        negative_out = model(negative)

        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()

        # Sample the loss every 10 steps (hard-negative mining is disabled).
        if step % 10 == 0:
            #with torch.no_grad():
            #    predictions = model(x_train_emb)
            #tree = BallTree(predictions, metric="euclidean")

            current_loss = loss.cpu().detach().numpy()
            running_loss.append(current_loss)

    # callback (accuracy)
    metrics = callback.on_epoch_end(model, epoch, current_loss)
    print(metrics)
    params.append(metrics)
    # Persist the metric history after every epoch so progress survives crashes.
    with open(TRAIN_PATH + 'training.pkl', 'wb') as f:
        pickle.dump(params, f)

    torch.save(model.state_dict(), TRAIN_PATH + 'model')

    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, N_EPOCHS, np.mean(running_loss)))
75 changes: 75 additions & 0 deletions src/bert_attempts/DataGenerator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import torch
import tqdm

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaModel


def generate_data(df_path: str, data_path: str, INPUT_SIZE: int, BATCH_SIZE: int):
    """Tokenize solution files with CodeBERT and persist train/test artifacts.

    Reads the CSV at `df_path` (uses columns `flines` — source text,
    `user` — author, `task` — task number), tokenizes each solution,
    pads/truncates to `INPUT_SIZE` token ids, splits train/test by task
    (tasks < 7 train, >= 7 test), embeds both splits with CodeBERT in
    mini-batches of `BATCH_SIZE`, and saves the resulting .npy and .pt
    files under `data_path`.
    """
    df = pd.read_csv(df_path)
    # df = df.drop(columns=["round", "task", "solution", "file",
    #                       "full_path", "Unnamed: 0.1", "Unnamed: 0", "lang"])
    # df["n_lines"] = df.flines.apply(lambda x: str(x).count("\n"))
    # df = df[(df.n_lines > 0)]

    # def _insert_tokens(x: str):
    #     x = x.replace("\n", " NLN ")
    #     x = x.replace("\t", " TAB ")
    #     x = x.replace(" ", " SPC ")
    #     return x
    #
    # df.flines = df.flines.apply(_insert_tokens)

    # load tokenizer
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
    df.index = np.arange(len(df))
    # Map author names to contiguous integer labels.
    le = LabelEncoder()
    df.user = le.fit_transform(df.user)
    df['tokens'] = df.flines.apply(lambda x: tokenizer
                                   .convert_tokens_to_ids(tokenizer.tokenize(x)))

    dataset = df[["user", "tokens", "task"]]
    # shuffle dataset
    dataset = dataset.sample(frac=1)

    X = dataset.tokens.values

    def fill_zeros(arr):
        # Right-pad with zeros (or truncate) to exactly INPUT_SIZE token ids.
        arr = np.array(arr)
        if INPUT_SIZE > arr.shape[0]:
            arr = np.pad(arr, (0, INPUT_SIZE - arr.shape[0]), 'constant')
        else:
            arr = arr[:INPUT_SIZE]
        return arr.reshape(INPUT_SIZE, 1).tolist()

    X = np.array([fill_zeros(x) for x in X])
    X = X.reshape((-1, INPUT_SIZE))
    y = np.array(dataset.user)
    tasks = np.array(dataset.task)
    # Task-based split: no task appears in both train and test.
    train_indexes = np.where(tasks < 7)[0]
    test_indexes = np.where(tasks >= 7)[0]
    X_train, X_test = X[train_indexes], X[test_indexes]
    y_train, y_test = y[train_indexes], y[test_indexes]

    embedding_model = RobertaModel.from_pretrained("microsoft/codebert-base")

    def get_embedding(data):
        # Embed token-id rows in mini-batches; returns a list of per-sample
        # last-hidden-state tensors.
        emb = []
        with torch.no_grad():
            for i in tqdm.tqdm(range(0, data.shape[0], BATCH_SIZE)):
                batch = data[i: i + BATCH_SIZE]
                new_part = embedding_model(torch.from_numpy(batch)).last_hidden_state
                # BUGFIX: the original did `emb = [*emb, *new_part]`, which
                # rebuilds the entire list every batch (quadratic overall);
                # extend appends in place (linear), same resulting list.
                emb.extend(new_part)
        return emb

    x_emb = get_embedding(X_test)
    np.save(data_path + 'x_train.np', X_train)
    np.save(data_path + 'y_test.np', y_test)
    np.save(data_path + 'y_train.np', y_train)
    np.save(data_path + 'x_test.np', X_test)
    torch.save(torch.cat(x_emb), data_path + 'test_tensor.pt')
    x_train_emb = get_embedding(X_train)
    torch.save(torch.cat(x_train_emb), data_path + 'train_tensor.pt')
45 changes: 45 additions & 0 deletions src/bert_attempts/GCJ.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import torch
import numpy as np

'''
The loader of the train data (batch generator)
'''


class GCJ:
    """Training-data loader: yields (anchor, positive, negative) triplet batches."""

    def __init__(self, X_train, y_train, batch_size, input_size):
        # X_train: embedded training samples; y_train: integer author labels.
        self.x = X_train
        self.y = y_train
        self.batch_size = batch_size
        self.input_size = input_size

    def batch_generator(self, model, tree):
        """Build one triplet batch around a randomly chosen anchor sample.

        `tree` is a BallTree over current embeddings, or None on the first
        call. When present, negatives are hard-mined: the anchor's nearest
        neighbours that carry a different author label.
        """
        # Up to half the batch is filled with same-author (positive) samples.
        n_positive = self.batch_size // 2
        anchor_index = np.random.choice(self.y.shape[0], 1)
        y_anchor = self.y[anchor_index]
        positive_indexes = np.where(self.y == y_anchor)[0]
        n_same = positive_indexes.shape[0]
        positive_indexes = positive_indexes[:n_positive]
        # k = remaining slots to fill with negatives.
        k = self.batch_size - positive_indexes.shape[0]

        if tree is not None:
            with torch.no_grad():
                query = model(self.x[anchor_index])
            # Query batch_size + n_same neighbours so that enough survive
            # after filtering out same-author hits.
            query_res = tree.query(query, self.batch_size+n_same, return_distance=False)[0]
            negative_indexes = np.array([neighbour_index for neighbour_index in query_res
                                         if self.y[neighbour_index] != y_anchor])[:k]
        else:  # the first batch generation: random negatives
            negative_indexes = np.where(self.y != y_anchor)[0]
            np.random.shuffle(negative_indexes)
            negative_indexes = negative_indexes[:k]

        # View the data as (n_samples, input_size, 768) embedding blocks.
        local_x = self.x.reshape((-1, self.input_size, 768))

        # Resample each index set up to batch_size entries (with replacement).
        reduced_indexes = map(lambda indexes: np.random.choice(indexes, self.batch_size),
                              [positive_indexes, negative_indexes])

        positive, negative = map(lambda i: local_x[i], reduced_indexes)
        # The single anchor sample is repeated batch_size times.
        anchor = torch.concat([local_x[anchor_index] for _ in range(self.batch_size)])

        return anchor, positive, negative
Loading