diff --git a/.gitignore b/.gitignore index 39aad2ea..41f81f29 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,15 @@ __pycache__ .DS_Store *.pkl *.tsv +*.json +*.json.bz2 +*.csv +90*1000*.png +examples/scaa +examples/task +examples/new_scaa +examples/one_to_one +examples/one_vs_10 lda_docstring_embeddings* .idea SourceCodeTools.egg-info diff --git a/Analysis of embeddings.ipynb b/Analysis of embeddings.ipynb new file mode 100644 index 00000000..b8ba11f8 --- /dev/null +++ b/Analysis of embeddings.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "befdf471-3e2e-45e3-b238-8b69dfe0148f", + "metadata": {}, + "outputs": [], + "source": [ + "from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, read_or_create_gnn_dataset\n", + "from SourceCodeTools.models.graph.train.sampling_multitask2 import SamplingMultitaskTrainer\n", + "from random import random\n", + "\n", + "from SourceCodeTools.models.training_config import get_config, save_config, load_config\n", + "from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq\n", + "from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer\n", + "from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective\n", + "from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \\\n", + " SubgraphClassifierObjective, SubgraphEmbeddingObjective\n", + "from SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective import SubgraphEmbeddingObjective, \\\n", + " SubgraphMatchingObjective\n", + "from SourceCodeTools.models.graph.train.utils import get_name, get_model_base\n", + "from SourceCodeTools.models.graph import RGGAN\n", + "from SourceCodeTools.tabular.common import compact_property\n", + "from SourceCodeTools.code.data.file_utils import unpersist\n", + "from 
SourceCodeTools.models.graph.train.objectives.SCAAClassifierObjetive import SCAAClassifierObjective\n", + "\n", + "\n", + "import dgl\n", + "import torch\n", + "import numpy as np\n", + "from argparse import Namespace\n", + "from torch import nn\n", + "from datetime import datetime\n", + "from os.path import join\n", + "from functools import partial" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "64226cdb-98dc-4b5d-84eb-2453428d4fee", + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"./examples/one_vs_10/with_ast\"\n", + "subgraph_partition = join(data_path, \"subgraph_partition.json.bz2\")\n", + "tokenizer_path = \"sentencepiece_bpe.model\"\n", + "\n", + "config = get_config(\n", + " subgraph_id_column=\"file_id\",\n", + " data_path=data_path,\n", + " model_output_dir=data_path,\n", + " subgraph_partition=subgraph_partition,\n", + " tokenizer_path=tokenizer_path,\n", + " objectives=\"scaa_match\",\n", + " measure_scores=True,\n", + " dilate_scores=1,\n", + " epochs=1000,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f8591242-be94-41a0-ba65-0caa9aaf74ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m\u001b[01;31mcommon_edges.json.bz2\u001b[0m \u001b[01;31mnode_names.json.bz2\u001b[0m\n", + "\u001b[01;31mcommon_filecontent.json.bz2\u001b[0m \u001b[01;31mRGGAN_100.zip\u001b[0m\n", + "\u001b[01;31mcommon_nodes.json.bz2\u001b[0m \u001b[01;34mRGGAN-2022-05-12-20-00-44-925831\u001b[0m/\n", + "\u001b[01;31mcommon_offsets.json.bz2\u001b[0m \u001b[01;34mRGGAN-2022-05-17-20-48-59-920596\u001b[0m/\n", + "\u001b[01;31mfull_scaa.zip\u001b[0m \u001b[01;31msubgraph_partition.json.bz2\u001b[0m\n" + ] + } + ], + "source": [ + "! 
ls ./examples/new_scaa/with_ast" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e902a433-0da4-4700-80f0-59c8e6bcc262", + "metadata": {}, + "outputs": [], + "source": [ + "base_path = data_path + '/u-pooling 100 epochs exp 3/'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "da2cc3a0-0b52-46ad-b4ad-57c1f4d54b8f", + "metadata": {}, + "outputs": [], + "source": [ + "# dataset = read_or_create_gnn_dataset(\n", + "# args={**config[\"DATASET\"], **config[\"TOKENIZER\"]},\n", + "# model_base=base_path, restore_state=True\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2ade8b5d-7261-480c-b832-8a476e9c6740", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/alina/workspace/scaa/method-embedding/SourceCodeTools/code/data/dataset/Dataset.py:131: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " self.nodes = self.nodes.append(new_nodes, ignore_index=True)\n", + "/home/alina/workspace/scaa/method-embedding/SourceCodeTools/code/data/dataset/Dataset.py:132: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. 
Use pandas.concat instead.\n", + " self.edges = self.edges.append(new_edges, ignore_index=True)\n" + ] + } + ], + "source": [ + "dataset = SourceGraphDataset(\n", + " **{**config[\"DATASET\"], **config[\"TOKENIZER\"]}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6cfd15ee-913c-4bb8-94ba-f5c76122b43a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DATASET': {'data_path': './examples/one_vs_10/with_ast',\n", + " 'train_frac': 0.9,\n", + " 'filter_edges': None,\n", + " 'min_count_for_objectives': 5,\n", + " 'self_loops': False,\n", + " 'use_node_types': False,\n", + " 'use_edge_types': False,\n", + " 'no_global_edges': False,\n", + " 'remove_reverse': False,\n", + " 'custom_reverse': None,\n", + " 'restricted_id_pool': None,\n", + " 'random_seed': None,\n", + " 'subgraph_id_column': 'file_id',\n", + " 'subgraph_partition': './examples/one_vs_10/with_ast/subgraph_partition.json.bz2'},\n", + " 'TRAINING': {'model_output_dir': './examples/one_vs_10/with_ast',\n", + " 'pretrained': None,\n", + " 'pretraining_phase': 0,\n", + " 'sampling_neighbourhood_size': 10,\n", + " 'neg_sampling_factor': 3,\n", + " 'use_layer_scheduling': False,\n", + " 'schedule_layers_every': 10,\n", + " 'elem_emb_size': 100,\n", + " 'embedding_table_size': 200000,\n", + " 'epochs': 1000,\n", + " 'batch_size': 128,\n", + " 'learning_rate': 0.001,\n", + " 'objectives': 'scaa_match',\n", + " 'save_each_epoch': False,\n", + " 'save_checkpoints': True,\n", + " 'early_stopping': False,\n", + " 'early_stopping_tolerance': 20,\n", + " 'force_w2v_ns': False,\n", + " 'use_ns_groups': False,\n", + " 'nn_index': 'brute',\n", + " 'metric': 'inner_prod',\n", + " 'measure_scores': True,\n", + " 'dilate_scores': 1,\n", + " 'gpu': -1,\n", + " 'external_dataset': None,\n", + " 'restore_state': False},\n", + " 'MODEL': {'node_emb_size': 100,\n", + " 'h_dim': 100,\n", + " 'n_layers': 5,\n", + " 'use_self_loop': True,\n", + " 'use_gcn_checkpoint': False,\n", + 
" 'use_att_checkpoint': False,\n", + " 'use_gru_checkpoint': False,\n", + " 'num_bases': 10,\n", + " 'dropout': 0.0,\n", + " 'activation': 'tanh'},\n", + " 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'}}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7410d7b5-79cb-4acc-aa23-9029874de594", + "metadata": {}, + "outputs": [], + "source": [ + "config['TRAINING']['model_base_path'] = base_path\n", + "config['TRAINING']['batch_size'] = 64\n", + "\n", + "config['MODEL'][\"activation\"] = eval(f\"nn.functional.{config['MODEL']['activation']}\")\n", + "# config['TRAINING']['objectives'] = ['subgraph_match']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2ba6475d-8428-4fa4-8792-92be62afe360", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of nodes 17892\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-06-07 11:04:58.847780: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2022-06-07 11:04:58.847811: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" + ] + } + ], + "source": [ + "model = SamplingMultitaskTrainer(dataset=dataset, \n", + " model_name=RGGAN,\n", + " model_params=config[\"MODEL\"],\n", + " trainer_params=config[\"TRAINING\"], \n", + " restore=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "493431d2-72dc-4636-988b-10c2900989d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 10, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "keys = list(model.objectives[0].subgraph_mapping.keys())\n", + "print(keys[-1] is pd.NA)\n", + "len(model.objectives[0].subgraph_mapping[keys[-1]])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bf394f00-3f3f-411f-a7b7-469065cc8975", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Computer Subgraph Embeddings: 0%| | 0/2 [00:00= 7].id.values\n", + "\n", + "train_indexes = np.where([n in train_ids for n in ids])[0]\n", + "test_indexes = np.where([n not in train_ids for n in ids])[0]\n", + "x_train = emb[train_indexes]\n", + "x_test = emb[test_indexes]\n", + "\n", + "y_train = [dataset[dataset.id == id].user.values[0] for id in train_ids]\n", + "y_test = [dataset[dataset.id == id].user.values[0] for id in test_ids]\n", + "\n", + "print(x_train.shape, x_test.shape, len(y_train), len(y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "a86fb45b-482f-46d3-904c-dbb55aa29018", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test acc 0.15\n", + "train acc 0.2571428571428571\n" + ] + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "knn.fit(x_train, y_train)\n", + "y_pred = knn.predict(x_test)\n", + "print('test acc', accuracy_score(y_test, y_pred))\n", + "y_pred = knn.predict(x_train)\n", + "print('train acc', accuracy_score(y_train, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "062caf27-e129-4c5c-948c-cccaee231f3b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeIAAAHSCAYAAAAwk8gOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABAj0lEQVR4nO3dfVxU5503/s/FMDAEcYgiASQpQn1KZKqGJjFZNjXTn8RlCam7dW3T9d70t3XTh1/QV8XEaCy1d+64Ma2S2za/27bp1t62rk2t7JQ2mE5IS9WYm4jBxMfCmg0wLKLLIIQZZuC6/xhnwujwMDNnzpmHz/v1ygv5MpzzJSIfrutc51xCSgkiIiLSRpLWDRARESUyBjEREZGGGMREREQaYhATERFpiEFMRESkIQYxERGRhpK1OGlWVpYsKCjQ4tRERESqe+edd3qllLMCfUyTIC4oKEBzc7MWpyYiIlKdEOKD8T7GqWkiIiINMYiJiIg0xCAmIiLSkCbXiImIKP64XC50dHTA4XBo3YpmDAYD8vPzodfrp/w5DGIiIlJER0cHMjIyUFBQACGE1u2oTkqJK1euoKOjA3PmzJny53FqmoiIFOFwODBz5syEDGEAEEJg5syZQc8IMIiJiEgxiRrCXqF8/QxiIiKKC319ffjBD34Q8ucXFBSgt7d3wte88847KC4uxic/+Uk8+eSTkFKGfD4vBjEREWnicEsnHtjxBuY8XY8HdryBwy2dYR0v3CCeiq9+9av44Q9/iIsXL+LixYt47bXXwj4mg5iIiFR3uKUTmw+dRmffECSAzr4hbD50Oqwwfvrpp9HW1obFixejuroaAwMDMJvNWLp0KYqLi1FXVwcAGBwcRHl5OT71qU9h0aJF+Nd//Ve/4wwNDWHlypX44Q9/6Fe32Wzo7+/HfffdByEE1q5di8OHD4fcrxdXTRMRkep2NpzHkGvErzbkGsHOhvN4dMnskI65Y8cOvPfeezh16hQAwO1249e//jWmT5+O3t5e3HfffXjkkUfw2muvIS8vD/X19QAAu93uO8bAwADWrFmDtWvXYu3atX7H7+zsRH5+vu/9/Px8dHaGN4oHOCImIiINdPUNBVUPhZQSzzzzDEwmEz772c+is7MT//mf/4ni4mK8/vrreOqpp9DU1ASj0ej7nMrKSjz++OM3hXAkMYiJiEh1eZlpQdVDsX//fly+fBnvvPMOTp06hdtuuw0OhwPz5s3DyZMnUVxcjK1bt2L79u2+z3nggQfw2muvBVyENXv2bHR0dPje7+jowOzZoY3exwo7iIUQtwshGoUQZ4QQ7wshqsLuioiI4lp12Xyk6XV+tTS9DtVl80M+ZkZGBq5du+Z73263Izs7G3q9Ho2NjfjgA88GSF1dXbjlllvwpS99CdXV1Th58qTvc7Zv345bb70VX//61286fm5uLqZPn4633noLUkrs27cPlZWVIffrpcSI2A3gm1LKOwHcB+DrQog7FTguERHFqUeXzMbzq4oxOzMNAsDszDQ8v6o45OvDADBz5kw88MADWLRoEaqrq/HYY4+hubkZxcXF2LdvHxYsWAAAOH36NO655x4sXrwY3/72t7F161a/49TW1mJoaAibNm266Rw/+MEP8I//+I/45Cc/iaKiIqxcuTLkfr2EEvdA+R1QiDoAe6SUr4/3mpKSEhmP+xEfbunEzobz6OobQl5mGqrL5of1TUVEFEvOnj2LhQsXat2G5gL9fxBCvCOlLAn0ekVXTQshCgAsAXAiwMfWAVgHAHfccYeSp1Ve60HAuh2wdwDGfMC8DTCtnvBTvEvxvasAvUvxATCMiYhoXIot1hJCTAPwKwDrpZT9N35cSrlXSlkipSyZNWuWUqdVXutBwPIkYP8QgPS8tTzpqU9goqX4RERE41EkiIUQenhCeL+U8pASx9SMdTvgumH5vGvIU5+AGkvxiYgo/iixaloA+DGAs1LK74XfksbsHcHVr1NjKT4REcUfJUb
EDwD4ewAPCSFOXf/vrxQ4rjaM+cHVr4vEUnwiIop/YS/WklL+CUD87Htl3ua5Jjx2elqf5qlPwLsgi6umiYgoGHzW9I28q6ODXDUNeMKYwUtEpI2+vj78/Oc/x9e+9rWQPr+goADNzc3Iysoa9zVbtmzBvn378F//9V8YGBgItVU/fMRlIKbVwIb3gJo+z9sphDAREQWp9SCwaxFQk+l5O8ndKZNRYxvEiooKvP3224oek0FMRETqC/FW0YlEehtEALjvvvuQm5sbco+BcGqaiIjUN9GtoiHOQkZ6G8RI4YiYiIjUF+KtosHgNohERETjCfFW0WAovQ1ipDCIiYhIfeZtnltDx5rCraITifQ2iJHCIFbBYEsPbDveRsfTTbDteBuDLT1at0REpC3TaqDiJcB4OwDheVvxUlh3qaixDeKmTZuQn5+Pjz76CPn5+aipqQm5Xy/Ft0GcinjdBjGQwZYe9B26COka9dWEPgmZq+YifUm2hp0RESmL2yB6BLsNIkfEEdbfcMkvhAFAukbR33BJm4aIiCiqMIgjbKTPGVSdiIgSC4M4wnSZqUHViYgosTCII2x6WQGE3v9/s9AnYXpZgTYNERFRVOGTtSLMuyCrv+ESRvqc0GWmYnpZARdqERERAAaxKtKXZKNN1w2r9SjsdjuMbxph1plhMpm0bo2IiDTGqWkVtLa2wmKx+J5narfbYbFY0NraqnFnRETxI9zdlwoKCtDb2zvuxz/66COUl5djwYIFuOuuu/D000+HfK6x4j6I7RYLLj5kxtmFd+LiQ2bYLRbVe7BarXC5XH41l8sFq9V602vPNjVi79cfx3fXVGDv1x/H2aZGtdokIlJVfXs9Vry6AqafmrDi1RWob68P63hqbIO4ceNGnDt3Di0tLTh69Ch+97vfhX3MuA5iu8UC27Pb4O7qAqSEu6sLtme3+cJYrdAbu7PHRPWzTY04sncPrvVeBqTEtd7LOLJ3D8OYiOJOfXs9ao7VwDZog4SEbdCGmmM1YYVxpLdBvOWWW7B8+XIAQEpKCpYuXYqOjvA3qYjra8Q9u3ZDOhx+NelwoGfXbnRlTsORvXvgHvbcz+sNPQBYWLpc0T6MRmPAMB674wcANB3Y5+vHyz3sRNOBfYr3RESkpdqTtXCM+P98dow4UHuyFuWF5SEdU81tEPv6+mCxWFBVVRVSr2PF9YjYbbONW58o9JRmNpuh1+v9anq9Hmaz2a927UrgaxPj1YmIYlX3YHdQ9VBEahtEt9uNL3zhC3jyySdRWFgYdp9xHcTJubnj1tUMPZPJhIqKCt9fttFoREVFxU2rpjNmZgX8/PHqRESxKic9J6h6KCK1DeK6deswd+5crF+/XpE+4zqIszeshzAY/GrCYED2hvWqh57JZMKGDRtQU1ODDRs2BLx1qXTNWiSn+D9xKzklFaVr1NugmohIDVVLq2DQ+f98NugMqFoa+lSvGtsgbt26FXa7Hbt37w65zxvFdRAbKyqQ+53tSM7LA4RAcl4ecr+zHcaKiqgMvYWly7Fi3TeQkTULEAIZWbOwYt03eH2YiOJOeWE5au6vQW56LgQEctNzUXN/TcjXh4HIb4PY0dGB5557DmfOnMHSpUuxePFi/OhHPwq5X6+E3gbxbFMjmg7sw7UrvciYmYXSNWsZekREIeI2iB7BboMY16umJ7OwdDmDl4iINBXXU9NERETRjkFMRESkIQYxERGRhhjEREREGmIQExERaYhBTEREcSHS2yACwMMPP4xPfepTuOuuu/DEE09gZGQk5PN5MYiJiEgTSm9Tq8Y2iAcPHsS7776L9957D5cvX8Yvf/nLsI/JII5yF05046fPHMX3n3gDP33mKC6cUO6B6EREWplsm9pQRHobRACYPn06AM/GD8PDwxBChNyvV0I/0CPaXTjRjcb95+AeHgUADFx1onH/OQDAvHuVezA6EZHaJtqm1lhREdIx1doGsaysDG+//TZWrlyJv/3bvw2p17E
4Io5ix+vafCHs5R4exfG6No06IiJSxkTb1ColUtsgNjQ0wGazwel04o033gi7TwZxFBu46gyqTkQUKybaplYpkdoGEQAMBgMqKyt9093hiOkgPtzSiQd2vIE5T9fjgR1v4HBLp9YtKWrajNSg6kREsWKibWpDFeltEAcGBmC7PmJ3u92or6/37egUjpgN4sMtndh86DQ6+4YgAXT2DWHzodNxFcbLKouQnOL/V5SckoRllUUadUREpIyJtqkNVaS3QRwcHMQjjzwCk8mExYsXIzs7G0888UTI/XrF7DaID+x4A519QzfVZ2em4ejTD4V17Ghy4UQ3jte1YeCqE9NmpGJZZREXahFRVOI2iB4Jsw1iV4AQnqgeq+bdm8PgJSKKYzE7NZ2XmRZUPaG1HgR2LQJqMj1vWw9q3REREV0Xs0FcXTYfaXqdXy1Nr0N12XyNOopSrQcBy5OA/UMA0vPW8iTDmIgoSsRsED+6ZDaeX1WM2ZlpEPBcG35+VTEeXTJb69aii3U74Lphut415KkTEZHmYvYaMeAJYwbvJOwdwdWJiEhVMTsipiky5gdXJyIiVSkSxEKIV4QQPUKI95Q4HinIvA3Q37CATZ/mqRMRxRE1tkH0euSRR7Bo0aKQzzWWUiPifwHwsELHIiWZVgMVLwHG2wEIz9uKlzx1IiINKb27nBrbIALAoUOHMG3aNMWOp0gQSyn/COCqEseiCDCtBja8B9T0ed4yhIlIY97d5bzPzvfuLhdOGKuxDeLAwAC+973v3fQ0rnDE9GItIiKKTRPtLhfqQ4zU2Abx2WefxTe/+U3ccsstIfUYiGqLtYQQ64QQzUKI5suXL6t1WiIiikJq7C6n9DaIp06dQltbGz73uc8p1iOgYhBLKfdKKUuklCWzZs1S67RERBSF1NhdTultEI8fP47m5mYUFBTgL/7iL3DhwgV85jOfCbtP3r5ERESqi8TucpHeBvGrX/0qurq6cOnSJfzpT3/CvHnz8Oabb4bcr5dSty/9AsBxAPOFEB1CiP9XieMSEVF8mndvDpY/tsA3Ap42IxXLH1sQ1iY3kd4GMVJidhtEIiKKLtwG0SPYbRA5NU1ERKQhBjEREZGGGMREREQaYhATERFpiEFMRESkIQYxERGRhhjEREQUF9TYBvEzn/kM5s+fj8WLF2Px4sXo6ekJ+Xxe3PSBiIg0cbapEU0H9uHalV5kzMxC6Zq1WFi6POTjeYP4a1/7moJd3mz//v0oKQl4S3BIOCImIiLVnW1qxJG9e3Ct9zIgJa71XsaRvXtwtqkx5GOqsQ1iJHBETEREqms6sA/uYf+dltzDTjQd2BfyqFiNbRAB4PHHH4dOp8Pf/M3fYOvWrRBChNSvF0fERESkumtXAl+LHa8eCqW3QQQ809KnT59GU1MTmpqa8LOf/SzsPhnERESkuoyZWUHVQ6H0NogAMHv2bE+fGRn44he/iLfffjvsPhnERESkutI1a5Gc4r/3cHJKKkrXBB6JTkWkt0F0u92+VdUulwu/+c1vsGjRopD79eI1YiIiUp33OrCSq6bHboO4cuVKPPXUU6ioqEBxcTFKSkr8tkGsrq5GUlIS9Ho9Xn75Zb/j1NbW4stf/jI2bdqEF154wVd3Op0oKyuDy+XCyMgIPvvZz+IrX/lKyP16cRvEGGO3WNCzazfcNhuSc3ORvWE9jBUVWrdFRMRtEK8LdhtEjohjiN1ige3ZbZAOBwDA3dUF27PbAIBhTEQUo3iNOIb07NrtC2Ev6XCgZ9dubRoiIqKwcUQc5erb61F7shbdg9040OVCoLvV3Dab6n0REZEyEiqIlX6cWqTVt9ej5lgNHCOeUXDvdGBW/82vS87NVbkzIiJSSsIEsfdxat4nuXgfpwYgasO49mStL4QB4OefEfin30oY3B+/RhgMyN6wPuhjH27pxM6G8+jqG0JeZhqqy+bj0SWzx/+E1oOAdTtg7wCM+YB5G2BaHfR5iYjIX8JcI57ocWr
Rqnuw2+/9o3fp8L/+SuDydABCIDkvD7nf2R70Qq3DLZ3YfOg0OvuGIAF09g1h86HTONzSGfgTWg8ClicB+4cApOet5UlPnYiIwpIwQazG49SUlpOec1Pt6F06/PdNt2Ph2TOY+4Y1pNXSOxvOY8g14lcbco1gZ8P5wJ9g3Q64hvxrriFPnYgoSqixDeLw8DDWrVuHefPmYcGCBfjVr34V8vm8EiaI1XicmtKqllbBoDP41Qw6A6qWVoV13K6+oaDqsHcEVycimoLBlh7YdryNjqebYNvxNgZbwtvbN9wgnornnnsO2dnZuHDhAs6cOYMHH3ww7GMmTBBH4nFqSrhwohs/feYovv/EG/jpM0dx4cTH09HlheWoub8Guem5EBDITc9Fzf01KC8sD+uceZlpQdVhzA+uTkQ0icGWHvQduoiRPs8lw5E+J/oOXQwrjNXYBvGVV17B5s2bAQBJSUnIygp/MJcwi7Ui8Ti1cF040Y3G/efgHh4FAAxcdaJx/zkAwLx7PdPS5YXlYQfvjarL5mPzodN+09Npeh2qy+YH/gTzNs814bHT0/o0T52IKAT9DZcgXaN+NekaRX/DJaQvyQ7pmJHeBrGvrw8A8Oyzz+LNN99EUVER9uzZg9tuuy2kfr0SZkQMeMJ43fd/gm8esGDd93+i+Wrp43VtvhD2cg+P4nhdW0TP++iS2Xh+VTFmZ6ZBAJidmYbnVxWPv2ratBqoeAkw3g5AeN5WvMRV00QUMu9IeKr1UCi9DaLb7UZHRwfuv/9+nDx5EsuWLcPGjRvD7jNhRsTRaOBq4G+48epKenTJ7IlvV7qRaTWDl4gUo8tMDRi6uszUAK8OzdhtEPV6PQoKCvy2Qfztb3+LrVu3wmw2Y9s2zwyfdxvEL37xixDC/xFKM2fOxC233IJVq1YBAD7/+c/jxz/+cdh9JtSIONpMmxH4G268OhFRvJheVgCh948goU/C9LKCkI8Z6W0QhRCoqKjAm2++CQCwWq248847Q+7Xi0GsoWWVRUhO8f8rSE5JwrLKIo06IiJSR/qSbGSumusbAesyU5G5am7I14cB/20Qq6ur8dhjj6G5uRnFxcXYt2+f3zaI99xzDxYvXoxvf/vb2Lp1q99xamtrMTQ0hE2bNt10jn/+539GTU0NTCYTfvazn+G73/1uyP16cRtEldm669De9iIcThsMqbm4RX4Frb8twsBVJ6bNSMWyyiLfQi0ioljCbRA9uA1iFLN11+HcuS0YHfWsPnY4uzCc9AJWPPkccnMqNe6OiIi0wKlpFbW3vegLYa/R0SG0t72oUUdERKQ1BrGKHM7A2xWOVyciovjHIFaRITXwdoXj1YmIKP4xiFVUWLQRSUn+j5FMSkpDYVH4N4QTEVFs4mItFXkXZI1dNV1YtJELtYiIEhiDWGW5OZUMXiKiCOjr68PPf/5zfO1rXwvp8wsKCtDc3DzuRg7Xrl1DaWmp7/2Ojg586Utfwu7du0M6nxeDmIiINNHa2gqr1Qq73Q6j0Qiz2QyTyRTy8bzbIIYaxJPJyMjwbSgBAHfffbfvcZfh4DViIiJSXWtrKywWi2/nI7vdDovFgtbW1pCPqcY2iF4XLlxAT0+P3wg5VBwRExGR6qxWK1wul1/N5XLBarWGPCqO9DaIYx04cAB/93d/d9PGEKHgiJiIiFQ3NvymUg+F0tsgjnXgwAF84QtfUKRPBjEREalubPhNpR6Ksdsgnjp1CrfddpvfNojFxcXYunUrtm/f7vsc7zaIE+3D8O6778LtduPuu+9WpE8GMRERqc5sNkOv1/vV9Ho9zGZzyMeM9DaIXr/4xS8UGw0DDGIiItKAyWRCRUWFbwRsNBpRUVER1qppNbZBBICDBw8qGsTcBpGIiBTBbRA9NNkGUQjxMIBaADoAP5JS7lDiuBTYhRPdOF7Xxj2MiYjiQNhT00IIHYDvA1gJ4E4AXxBC3BnucSmwCye60bj/HAauOgEAA1edaNx/DhdOdGv
cGRERhUKJa8T3APizlLJdSjkM4AAAPsMxQo7XtcE9POpXcw+P4nhdm0YdERFROJQI4tkAPhzzfsf1GkWAdyQ81ToREUU31VZNCyHWCSGahRDNly9fVuu0cWfajNSg6kREFN2UCOJOALePeT//es2PlHKvlLJESlkya9YsBU6bmJZVFiE5xf+vLTklCcsqizTqiIiIwqFEEP8fAHOFEHOEECkA1gD4NwWOSwHMuzcHyx9b4BsBT5uRiuWPLeCqaSJKeN7dl0JVUFCA3t7eCV/zi1/8AsXFxTCZTHj44Ycnff1UhH37kpTSLYT4BoAGeG5fekVK+X7YndG45t2bw+Alophn665De9uLcDhtMKTmorBoY1j7tUd6G0S3242qqiqcOXMGWVlZ2LRpE/bs2YOampqwjqvINWIp5W+llPOklEVSyueUOCYREcUvW3cdzp3bAoezC4CEw9mFc+e2wNZdF/IxI70NopQSUkoMDg5CSon+/n7k5eWF3K8Xt0EkIiLVtbe9iNHRIb/a6OgQ2tteDHlUHOltEPV6PV5++WUUFxcjPT0dc+fOxfe///2Qeh2Lz5omIiLVOZy2oOqhUHobRJfLhZdffhktLS3o6uqCyWTC888/H3afDOIEM9jSA9uOt9HxdBNsO97GYEuP1i0RUQIypOYGVQ+F0tsgekfaRUVFEEJg9erVOHbsWNh9MogTyGBLD/oOXcRIn+fhHyN9TvQdusgwJiLVFRZtRFJSml8tKSkNhUUbQz5mpLdBnD17Ns6cOQPvszBef/11RTa54DXiBNLfcAnS5f94TOkaRX/DJaQvyVbsPIdbOrGz4Ty6+oaQl5mG6rL5eHQJH7ZGRB/zXgdWctX02G0QV65ciaeeegoVFRUoLi5GSUmJ3zaI1dXVSEpK8l33Hau2thZf/vKXsWnTJrzwwgu+el5eHr71rW/hL//yL6HX6/GJT3wC//Iv/xJyv17cBjGBdDzdNO7H8neUKnKOwy2d2HzoNIZcI75aml6H51cV4/+BHv0NlzDS54QuMxXTywoU/QWAiLTFbRA9NNkGkWKDLjPVNy19Y10pOxvO+0J4TlIv7k7uRLoYxsm6Ftw6OhdFw7cB+HhaHADDmIgSGq8RJ5DpZQUQev+/cqFPwvSyAsXO0dXnuR1hTlIvHtB/gGlJwxACSIITfxRn8Oekj1dEeqfFiYgSGYM4gaQvyUbmqrm+EbAuMxWZq+YqOiLNy/Qsvrg7uRPJwv969IgYRXNyu38twAidiCiRcGo6waQvyVZ8KvhsUyOaDuzDtSu9WJ1xK47ccjfSU4cDvnZAOPzeV3JanIi0J6WEEELrNjQTyrorjogpLGebGnFk7x5c670MSAl3/1WYe/8Alwz8rTVNGnx/VnpanIi0ZTAYcOXKlZDCKB5IKXHlyhUYDIbJXzwGR8QJaOwINmNmFkrXrMXC0uUhHavpwD64h/2nl6V7GNkD/4nBmXlwuVy+erIuGfckzweGwVXTRHEoPz8fHR0dSOQ95w0GA/Lz84P6HAZxgvGOYL3hea33Mo7s3QMAIYXxtSuBtwAb6foPVPzDP8FqtcJut8NoNMJsNsNkMoXePBFFNb1ejzlz5mjdRsxhECeYQCNY97ATTQf2hRTEGTOzPNPSAeomk4nBS0Q0CV4jTjDjjWDHq0+mdM1aJKf4L7hKTklF6ZqbH5hOREQ344g4wUw0gg2FdxSt1DVnIqJEwyBOMKVr1vpdIwbCH8EuLF3O4CUiChGDOI7Vt9ej9mQtuge7kZOeg6qlVSgvLQfAESwRUbRgEIfIbrGgZ9duuG02JOfmInvDehgrKrRuy6e+vR41x2rgGPE8QMM2aEPNsRoAQHlpedwF7+GWTtT82/voG/LcLnXrLXp8q+Iu7vpERFGPi7VCYLdYYHt2G9xdXZ6HWHR1wfbsNtgtFq1b86k9WesLYS/HiAO1J2s16ihyDrd0ovqX7/pCGAD+6yMXql99F4dbOjXsjIhocgziEPT
s2g3p8A856XCgZ9dubRoKoHuwO6h6LNvZcB6u0Zuf5OMakdjZcF6DjoiIpo5BHAK3zRZUXQs56TlB1WOZd8enYD9GRBQNGMQhSM7NDaquhaqlVTDo/J93atAZULW0SqOOIse741OwHyMiigYM4hBkb1gPccNDvYXBgOwN67VpKIDywnLU3F+D3PRcCAjkpuei5v4alBeWa92a4qrL5kOfdPNuL3qdQHXZfA06IiKaOq6aDoF3dXQ0r5oGPGEcj8F7I+/KaK6aJqJYJLTYrqqkpEQ2Nzerfl4iIiItCCHekVKWBPoYp6aJiIg0xCAmIiLSEIOYiIhIQ1ysRUREBODCiW4c/+VpDAwkYVrSZSzL/h3mrSoHTKsjel4GMXm++eraMHDViWkzUrGssgjz7o2/B38QEY3nwoluNP7sfQwk92Iw6xIu65z4wD0f9x58DWVARMOYQZzgLpzoRuP+c3APjwIABq460bj/HAAwjIkoYRyva0PyJ45gwV2/Q2rqIJzOdFz698V4q6cIub/7CUwRDGJeI05wx+vafCHs5R4exfG6No06IiJSn0j/A25f/GsYDIMQAjAYBjF33lvIym6DdWhhRM/NIE5wA1edQdWJiOLRbYsPQ6cb8avpdCMomHMKdmRE9NwM4gQ3bUZqUHUioniUnHY1YD01dRDGNH1Ez80gTnDLKouQnOL/bZCckoRllUUadUREpD5DauBNe5zOdNw598GInptBnODm3ZuD5Y8t8I2Ap81IxfLHFnChFhEllMKijRgdSfGrjYzo0P3+SnQ3RzYquWqaMO/eHAYvESW03JxKvP7j93HbkgPQpQ4CAKTLAP2wEdcivGaGQUxERATAME2PpGQXxPVdVZMNg8j99M9gmKYH8FDEzsupaSIiIgDZpl8jKXnYr5aUPIxs068jel4GMREREYAR2RNUXSkMYiIiIoy/cnq8ulIYxERERPCsnE5KSvOrJSWlobBoY0TPyyAmIiKCZ+V0Ts4qALrrFR1yclYhN6cyoudlEBMREQGwddehu/sQAO+jLkfQ3X0Itu66iJ43rCAWQnxeCPG+EGJUCFGiVFNERERqa297EaOjQ3610dEhtLe9GNHzhjsifg/AKgB/VKAXIiIizTictqDqSgkriKWUZ6WU55VqhoiISCtcNU1ERKQhrVZNT/qISyHE7wEEehDxFinllK9gCyHWAVgHAHfccceUGyQiIlKDd3V0e9uLcDhtMKTmorBoY8RXTQspZfgHEeJNABullM1TeX1JSYlsbp7SS4mIiGKeEOIdKWXARc2cmiYiItJQWLsvCSE+B+B/ApgFoF4IcUpKWaZIZxQzLpzoxvG6NgxcdWLajFQsqyzitopERFMUVhBLKX8NILLbUlBUu3CiG437z8E9PAoAGLjqROP+cwDAMCYimgJOTVNYjte1+ULYyz08iuN1bRp1REQUWxjEFJaBq86g6kRE5I9BTGGZNiM1YP0jgx317fUqd0NEFHsYxBSWZZVFSE7x/zZyJQ3jWH4dao7VMIyJiCbBIKawzLs3B8sfW4CPDHZISFxLuYo/FB7AyPBZ/PXrM3F28w+w9+uP42xTo9atEhFFpbBWTRMBnjD+2blvQcLzcJg5nbfggdMzkTzq+T3vWu9lHNm7BwCwsHT5lI452NKD/oZLGOlzQpeZiullBUhfkh2ZL4CISEMcEZMictI/vlXp7vO3+kLYyz3sRNOBfVM61mBLD/oOXcRIn2fB10ifE32HLmKwpUe5homIogSDmBRRtbQKBp0BAJDu0AV8zbUrvVM6Vn/DJUiX/y1R0jWK/oZLYfVIRBSNODVNiigvLAcA1J6sxaBhBNMcN39rZczMmtKxvCPhqdaJiGIZR8SkmPLCchz52yNY/Y9PITnF/7am5JRUlK5ZO6Xj6DID3xI1Xp2IKJYxiElxC0uXY8W6byAjaxYgBDKyZmHFum9MeaHW9LICCL3/t6bQJ2F6WUEEuiUi0hanpikiFpYun3L
w3si7OlrpVdP17fWoPVmL7sFu5KTnoGpplW9KnYhIKwxiikrpS7IVvV2pvr0eNcdq4BhxAABsgzbUHKsBAIYxEWmKU9OUEGpP1vpC2Msx4kDtyVqNOiIi8mAQU0LoHuwOqk5EpBYGMSWEsQ8cmUqdiEgtDGJKCGMfOOJl0BlQtbRKo46IiDy4WIsSwtgHjnDVNBFFEwYxJYzywnIGLxFFHU5NU0ypb6/HildXwPRTE1a8uoL7HRNRzOOImGIG7wUmonjEETHFDN4LTETxiEFMMYP3AhNRPGIQU8zgvcBEFI8YxBQzeC8wEcUjLtaimMF7gYkoHjGIKabwXmAiijecmiYiItIQR8RERJTwWltbYbVaYbfbYTQaYTabYTKZVDk3g5iIiBJaa2srLBYLXC4XAMBut8NisQCAKmHMqWkiIkpoVqvVF8JeLpcLVqtVlfMziImIKKHZ7fag6kpjEBMRUUIzGo1B1ZXGICYiooRmNpuh1+v9anq9HmazWZXzc7EWERElNO+CLK6aJiIi0ojJZFIteG/EqWkiIiINMYiJiIg0xCAmIiLSEK8RE01isKUH/Q2XMNLnhC4zFdPLCpC+JFvrtogoTnBETDSBwZYe9B26iJE+JwBgpM+JvkMXMdjSo3FnRKQUu8WCiw+ZcXbhnbj4kBn264+3VAuDmGgC/Q2XIF2jfjXpGkV/wyVtGiIiRdktFtie3QZ3VxcgJdxdXbA9u03VMGYQE03AOxKeap2IYkvPrt2QDodfTToc6Nm1W7UeGMREE9Blpgasi9QRTaeyiEgZbpstqHokMIiJJjC9rABCf8M/kySJobf/t6ZTWUSkjOTc3KDqkcAgJppA+pJsZK6a6xsZ6zJTMXzhEFz/ftTvdWpPZRGRMrI3rIcwGPxqwmBA9ob1qvUQ1u1LQoidACoADANoA/C4lLJPgb6Iokb6kmy/25XOLvyHgK9TcyqLiJRhrKgA4LlW7LbZkJybi+wN6311NYR7H/HrADZLKd1CiH8GsBnAU+G3RRS9knNzPdPSAepEFHuMFRWqBu+NwpqallIekVK6r7/7FoD88Fsiim7RMJVFRPFDySdrfRnAv473QSHEOgDrAOCOO+5Q8LRE6oqGqSwiih9CSjnxC4T4PYCcAB/aIqWsu/6aLQBKAKySkx0QQElJiWxubg6hXSIiotgjhHhHSlkS6GOTjoillJ+d5OD/AOCvAZinEsJERET0sXBXTT8MYBOAB6WUHynTEhERUeII9z7iPQAyALwuhDglhPj/FeiJiIgoYYQ1IpZSflKpRoiIiBIRn6xFRESkISVvXyKiSRxu6cTOhvPo6htCXmYaqsvm49Els7Vui4g0xCAmUsnhlk5sPnQaQ64RAEBn3xA2HzoNAAxjogTGqWkilexsOO8LYa8h1wh2NpzXqCMiigYMYiKVdPUNBVUnosTAICZSSV5mWlB1IkoMDGIilVSXzUeaXudXS9PrUF02X6OOiCgacLEWkUq8C7K4apqIxmIQE6no0SWzGbxE5IdT00RERBpiEBMREWmIQUxERKQhBjEREZGGGMREREQaYhATERFpiEFMRESkIQYxERGRhhjEREREGmIQExERaYhBTEREpCEGMRERkYa46QMR3WSwpQf9DZcw0ueELjMV08sKkL4kW+u2iOISg5iI/Ay29KDv0EVI1ygAYKTPib5DFwGAYUwUAZyaJiI//Q2XfCHsJV2j6G+4pE1DRHGOQUwUaa0HgV2LgJpMz9vWg1p3NKGRPmdQdSIKD6emiSKp9SBgeRJwDXnet3/oeR8ATKu162sCuszUgKGry0zVoBui+McRMVEkWbd/HMJeriFPPUpNLyuA0Pv/aBD6JEwvK9CmIaI4xxExUSTZO4KrRwHvgiyumiZSB4OYKJKM+Z7p6ED1KJa+JJvBS6QSTk0TRZJ5G6BP86/p0zz1KHO4pRMP7HgDc56uxwM73sDhlk6tWyJKCBwRE0WSd0GWdbtnOtqY7wnhKFu
odbilE5sPncaQawQA0Nk3hM2HTgMAHl0yW8vWiOIeg5go0kyroy54b7Sz4bwvhL2GXCPY2XCeQUwUYQxiIo2cbWpE04F9uHalFxkzs1C6Zi0Wli7XpJeuvqGg6kSkHF4jJtLA2aZGHNm7B9d6LwNS4lrvZRzZuwdnmxo16ScvMy2oOhEph0FMpKLBlh7YdryNab/R4eHsx3FH+kLfx9zDTjQd2KdJX9Vl85Gm1/nV0vQ6VJfN16QfokTCqWkilYzdTEEIgXS9EZ/OWgkA+I/BswCAa1d6NenNex14Z8N5dPUNIS8zDdVl83l9mEgFDGIilQTaTCE5SQ/TrQ/6gjhjZpYWrQHwhDGDl0h9nJomUsl4mybckjwdAJCckorSNWvVbImIogBHxEQqGU1LRtKQ+6b6R+5+ZGTN0nTVNBFph0FMpJIzQ24skBLJQvhqbinx73oj1n33Jxp2RkRaYhATqaStzwWHXuBOgw5pScDQKHDGMYJOl9S6NSLSEIOYSCXTZqSi86oTnS73TXUiSlxcrEWkkmWVRUhO8f8nl5yShGWVRRp1RETRgCNiIpXMuzcHAHC8rg0DV52YNiMVyyqLfHUiSkwMYiIVzbs3h8FLRH7CmpoWQnxHCNEqhDglhDgihMhTqjEiIqJEEO414p1SSpOUcjGA3wCIvt3OiYiIolhYQSyl7B/zbjoA3odBREQUhLCvEQshngOwFoAdAB8LREREFAQh5cSDWCHE7wEEWl2yRUpZN+Z1mwEYpJTfGuc46wCsA4A77rjj7g8++CDkpokS0eGWTu6ORBSjhBDvSClLAn5ssiAO4iR3APitlHLRZK8tKSmRzc3NipyXKBEcbunE5kOnMeQa8dXS9Do8v6qYYUwUAyYK4nBXTc8d824lgHPhHI+IAtvZcN4vhAFgyDWCnQ3nNeqIiJQS7jXiHUKI+QBGAXwA4InwWyKiG3X1DQVVJ6LYEVYQSyn/RqlGiGh8eZlp6AwQunmZaRp0Q0RK4rOmiWJAddl8pOl1frU0vQ7VZfM16oiIlMJHXBLFAO+CLO+q6aXT+rFU34lTdW/h3980wmw2w2QyadwlEYWCQUwUIx5dMhuPLpmN1tZWWCwWuIZcAAC73Q6LxQIADGOiGMSpaaIYY7Va4XK5/GoulwtWq1WjjogoHAxiohhjt9uDqhNRdGMQE8UYo9EYVJ2IohuDmCjGmM1m6PV6v5per4fZbNaoIyIKBxdrEcUY74Isq9UKu90Oo5GrpoliGYOYKAaZTKaYCd769nrUnqxF92A3ctJzULW0CuWF5Vq3RRQ1GMREFDH17fWoOVYDx4gDAGAbtKHmWA0AMIyJruM1YiKKmNqTtb4Q9nKMOFB7slajjoiiD4OYiCKme7A7qDqRUurb67Hi1RUw/dSEFa+uQH17vdYtjYtT00QUMTnpObAN2gLWiSKlvr0eDXu3YOsbTszsB65M/xCvPrQFWBedl0Q4IiaiiKlaWgWDzuBXM+gMqFpapVFHlAj+9Mr/wOO/cWJWvyfkZvUDj//GiT+98j+0bi0gjoiJ4oStuw7tbS/C4bTBkJqLwqKNyM2p1LQn7+iDq6ZJTSuPXIXB7V8zuD11/HdtepoIg5goDti663Du3BaMjnr2LHY4u3Du3BYAiIowZvCSmrL6g6trjVPTRHGgve1FXwh7jY4Oob3tRY06ItKOOzszqLrWGMREccDhvHlB1ER1onj2iepnMJrq/xjY0VQ9PlH9jEYdTYxT00RxwJCaC4ezK2CdKNEYKyoAAD27dsNtsyE5NxfZG9b76tGGQUwUBwqLNvpdIwaApKQ0FBZt1LArIu0YKyqiNnhvxCAmigPeBVnRtmqaiCbHICaKE7k5lQxeohjExVpEREQaYhATERFpiEFMRESkIV4jJqKYdbapEU0H9uHalV5kzMxC6Zq1WFi6XOu2iILCICaimHS2qRFH9u6Be9gJALjWexlH9u4BAIYxxRROTRNRTGo6sM8Xwl7uYSe
aDuzTqCOi0DCIiSgmXbvSG1SdKFpxapqIIqa1tRVWqxV2ux1GoxFmsxkmk0mRY2fMzMK13ssB60SxhCNiIoqI1tZWWCwW2O12AIDdbofFYkFra6sixy9dsxbJKal+teSUVJSuWavI8YnUwiAmooiwWq1wuVx+NZfLBavVqsjxF5Yux4p130BG1ixACGRkzcKKdd/gQi2KOZyaJqKI8I6Ep1oPxcLS5QxeinkcERNRRBiNxqDqRImKQUxEEWE2m6HX+2/OrtfrYTabNeqIKDpxapoowURyJfNY3mOqcS6iWMYgJkog3pXM3kVU3pXMAMYNyPr2etSerEX3YDdy0nNQtbQK5YXlUzqfyWRi8BJNglPTRAkk2JXM9e31qDlWA9ugDRIStkEbao7VoL69Xo12iRICg5gogQS7krn2ZC0cIw6/mmPEgdqTtYr3RpSoGMRECSTYlczdg91B1YkoeAxiogQS7ErmnPScoOpEqmo9COxaBNRket62HtS6o5BwsRZRAgl2JXPV0irUHKvxm5426AyoWlqlSr9E42o9iNbDL8E6ugJ2ZMBovwbz4ZdgAgDTaq27CwqDmCjBBLOS2bs6OtRV01NxuKUTOxvOo6tvCHmZaagum49Hl8xW7PgUn1p/9xNYRh+EC54ZHjumwzL6IPC7n8DEICaieFJeWK5o8I51uKUTmw+dxpBrBADQ2TeEzYdOAwDDmCZkHVroC2EvF/SwDi1ErN0wx2vERDSp+vZ6rHh1BUw/NWHFqysUu31pZ8N5Xwh7DblGsLPhvCLHp/hlR0ZQ9WimSBALIb4phJBCCG4EShRnInkvcVffUFB1Ii9jmj6oejQLO4iFELcDWAHgP8Jvh4iiTSTvJc7LTAuqTuRlXvkI9DrhV9PrBMwrH9Goo9ApMSLeBWATAKnAsYgoykTyXuLqsvlI0+v8aml6HarL5od97IDi5HYX8iw6rKj8nO8eeKPRiIrKz8XkI1XDWqwlhKgE0CmlfFcIMenriSj25KTnwDZoC1gPl3dBliqrplsPApYnAdf1aW/7h573gZi73YU84uVZ5pMGsRDi9wAC/YvbAuAZeKalJyWEWAdgHQDccccdQbRIRFqK9L3Ejy6Zrc4Kaev2j0PYyzXkqTOISUOTBrGU8rOB6kKIYgBzAHhHw/kATgoh7pFS3jRnJaXcC2AvAJSUlHAamyhGqHEvsSrsHcHViVQS8tS0lPI0gGzv+0KISwBKpJS9CvRFRFEkkvcSq8aY75mODlQn0hDvIyaixGDeBuhvWI2tT/PUiTSk2JO1pJQFSh2LiEhx3uvA1u2e6WhjvieEeX2YNMZHXBJR4jCtZvDGMLvFgp5du+G22ZCcm4vsDethrKjQuq2wMYiJiCjq2S0W2J7dBunwrN53d3XB9qznskKshzGvERMRUdTr2bXbF8Je0uFAz67d2jSkIAYxERFFPbft5ofKTFSPJQxiIopqdosFFx8y4+zCO3HxITPsFovWLZEGknNzg6rHEgYxEUUt73VBd1cXIKXvuiDDOPFkb1gPYTD41YTBgOwN67VpSEEMYiKKWvF8XZCCY6yoQO53tiM5Lw8QAsl5ecj9zvaYX6gFcNU0EUWxeL4uSMEzVlTERfDeiCNiIopa8XxdkMaXaOsCGMREFLXi+bogBZaI6wIYxEQUteL5uiAFlojrAniNmIiiWrxeF6TAEnFdAEfEREQUNRJxXQCDmIiINHPjwqxpD/5lwq0LYBATEZEmAi3Msv/6MIyfezSh1gXwGjEREWlivIVZA3/4I+a+YdWoK/VxRExERJpIxIVZgXBETERxz9Zdh/a2F+Fw2mBIzUVh0Ubk5lRq3VbCS87N9UxLB6gnEo6IiSiu2brrcO7cFjicXQAkHM4unDu3BbbuOq1bi3u27jocPVoK6xufxNGjpTf9P+cDWzwYxEQUt2zddThzphqjo0N+9dHRIbS3vahRV4lhKr8A8YEtHpyaJqK45A0CYCTgxx3OxLoOqbb
2thfH/QVo7GUBPrCFQUxEcSpQEIxlSE2s65BqG+8XHW99sKUH/Q2XMNLnhC4zFdPLCpC+JFvNFqMGp6aJKC5NNOJNSkpDYdFGFbtJPOP9omNIzcVgSw/6Dl3ESJ8TADDS50TfoYsYbOlRs8WowSAmorg0/ohXhwULnuOq6QgrLNqIpKQ0v5r3F6D+hkuQrlG/j0nXKPobLqnYYfRgEBNRXBovCO68cydDWAW5OZVYsOA5GFLzAAgYUvN8vwB5R8I3Gq8e73iNmIhUU99ej9qTtege7EZOeg6qllahvLA8Iufyhi3vH9ZObk5lwP/fuszUgKGry0xVo62owyAmIlXUt9ej5lgNHCOeRxraBm2oOVYDABENYwZv9JleVoC+Qxf9pqeFPgnTywq0a0pDDGIiUkXtyVpfCHs5RhyoPVkbsSAO19mmRjQd2IdrV3qRMTMLpWvWYmHpcq3binne1dFcNe3BICYiVXQPdgdV19rZpkYc2bsH7mHPFOq13ss4sncPADCMFZC+JDthg/dGDGIiUkVOeg5sgzffUpSTnqNBN5NrOrDPF8Je7mEnmg7sS9ggvnCiG8fr2jBw1YlpM1KxrLII8+6Nzr+/WMJV00SkiqqlVTDo/J8rbNAZULW0SqOOJnbtSq/f+5lFdtz5xYsoXPXHgM9NjncXTnSjcf85DFz1/HIycNWJxv3ncOFEdM5oxBIGMRGporywHDX31yA3PRcCArnpuai5vyZqrw9nzMzy/TmzyI47HrQhJcMNIZCQG0ccr2uDe9j/3l/38CiO17Vp1FH84NQ0EammvLA8aoP3RqVr1vquEefd24MkvfT7eKDnJscz70h4qnWaOgYxEVEA3uvATQf2QT/tbMDXqLFxRLQ8k3najNSAoTttRmLe+6skBjERRbXW1lZYrVbY7XYYjUaYzWaYTCZVzr2wdDkWli7H0aOl17fz8xfpjSO8z2T23m/rfSYzANXDeFllERr3n/Obnk5OScKyyiJV+4hHDGIiijreUeD5/kv4U8o5uOH54W+322GxWABAtTAGPI/LPHdui99uTmpsHDHRM5nTdW8C1u2AvQMw5gPmbYBpdcR68a6O5qpp5TGIiSiqjB0FNqe0+0LYy+VywWq1qhrEWj0uc/xnMjsAy5OA6/ovBvYPPe8DEQ/jCYO39aCqvxzECwYxEUWVsaPAAeEI+Bq73a5mSwC0eVzmuM9kTrr6cQh7uYY8IahV8LUe1OSXg3jA25eIKKqMDZ5p0hDwNUajUa12NDW9rABC7/9jWuiTMD3pJ34126wUHL3nVlhNH2l3j7N1+/i/HNCEGMREFFXG7sBT4i6ETvr/mNLr9TCbzWq3pYn0JdnIXDXX9/9El5mKzFVzkT7z43t3bbNScG5eBhwGHSCEdvc42zuCq5MPp6aJKKqM3Znnk6O5gAto1rdjQDhUXzUdDQI+k1m3zTcN3D4nHaM64fdhTe5xNuZ7pqMD1WlCDGIiiio37swzf3oBPl32GW4QMJb3mqt1OxypHwV8iRr3OPsxb/O/RgwA+jRPnSbEICaiqMOdeabAtBowrYZBo3ucA/YDcNV0CBjEREQxbLJ7nFXdMen6LwcUHAYxEVEMm+geZ++OSd6nYXl3TALAB3FEkbCCWAhRA+ArAC5fLz0jpfxtuE0REdHUjXeP80Q7JjGIo4cSI+JdUsoXFTgOEREpiDsmxQbeR0xEFKfG2xmJOyZFFyWC+BtCiFYhxCtCiFsVOB4RESlgWWURklP8f8xzx6ToM2kQCyF+L4R4L8B/lQBeBlAEYDEAG4DvTnCcdUKIZiFE8+XLl8d7GRERKWTevTlY/tgC3wh42oxULH9sAa8PRxkhpVTmQEIUAPiNlHLRZK8tKSmRzc3NipyXiIgo2gkh3pFSlgT6WFhT00KIsXeMfw7Ae+Ecj4iIKNGEu2r6BSHEYgASwCUA/xRuQ0RE8aK1tRVWqxV2uz0hn5NNUxNWEEsp/16pRoiI4klrayssFgt
cLhcAzx7KFosFABjG5Ie3LxERRYDVavWFsJfL5YLVatWoI4pWDGIiSiytB4Fdi4CaTM/b1oMROY3dbg+qTomLQUxEiaP1oGerPvuHAKTnreXJiISx0WgMqk6Ji0FMRInDut1/v1zA8751u+KnMpvN0Ov1fjW9Xg+z2az4uSi2cfclIkoc9o7g6mHwLsjiqmmaDIOYiBKHMf/6tHSAegSYTCYGL02KU9NElDjM2wB9mn9Nn+apE2mEQUxEicO0Gqh4CTDeDkB43la85KkTaYRT00SUWEyrGbwUVRjEREQKO9vUiKYD+3DtSi8yZmahdM1aLCxdrnVbFKUYxERECjrb1Igje/fAPewEAFzrvYwje/cAAMOYAuI1YiIiBTUd2OcLYS/3sBNNB/Zp1BFFOwYxEZGCrl3pDapOxCAmIlJQxsysoOpEDGIiIgWVrlmL5JRUv1pySipK16zVqCOKdlysRUSkIO+CLK6apqliEBMRKWxh6XIGL00Zp6aJiIg0xCAmIiLSEIOYiIhIQwxiIiIiDTGIiYiINMQgJiIi0hCDmIiISEMMYiIiIg0xiImIiDTEICYiItIQg5iIiEhDDGIiIiINMYiJiIg0xCAmIiLSEIOYiIhIQ0JKqf5JhbgM4IMJXpIFoFeldrTAry+28euLbfz6Ylusfn2fkFLOCvQBTYJ4MkKIZillidZ9RAq/vtjGry+28euLbfH49XFqmoiISEMMYiIiIg1FaxDv1bqBCOPXF9v49cU2fn2xLe6+vqi8RkxERJQoonVETERElBCiKoiFEJ8XQrwvhBgVQpTc8LHNQog/CyHOCyHKtOpRKUKIxUKIt4QQp4QQzUKIe7TuSWlCiP9PCHHu+t/pC1r3EwlCiG8KIaQQIkvrXpQkhNh5/e+uVQjxayFEptY9KUEI8fD1nyF/FkI8rXU/ShJC3C6EaBRCnLn+b65K656UJoTQCSFahBC/0boXJUVVEAN4D8AqAH8cWxRC3AlgDYC7ADwM4AdCCJ367SnqBQDfllIuBrDt+vtxQwixHEAlgE9JKe8C8KLGLSlOCHE7gBUA/kPrXiLgdQCLpJQmABcAbNa4n7Bd/5nxfQArAdwJ4AvXf7bECzeAb0op7wRwH4Cvx9nXBwBVAM5q3YTSoiqIpZRnpZTnA3yoEsABKaVTSvnvAP4MINZHkBLA9Ot/NgLo0rCXSPgqgB1SSicASCl7NO4nEnYB2ATP32VckVIekVK6r7/7FoB8LftRyD0A/iylbJdSDgM4AM/PlrggpbRJKU9e//M1eAJrtrZdKUcIkQ+gHMCPtO5FaVEVxBOYDeDDMe93IPa/wdYD2CmE+BCe0WLMjzhuMA9AqRDihBDiD0KIT2vdkJKEEJUAOqWU72rdiwq+DOB3WjehgHj8ORKQEKIAwBIAJzRuRUm74fnFd1TjPhSXrPYJhRC/B5AT4ENbpJR1avcTSRN9rQDMADZIKX8lhFgN4McAPqtmf+Ga5OtLBjADnimyTwM4KIQolDG0TH+Sr+8ZeKalY9ZU/i0KIbbAM+W5X83eKHRCiGkAfgVgvZSyX+t+lCCE+GsAPVLKd4QQn9G4HcWpHsRSylDCphPA7WPez79ei2oTfa1CiH3wXO8AgF8iBqdbJvn6vgrg0PXgfVsIMQrPM2Ivq9VfuMb7+oQQxQDmAHhXCAF4vh9PCiHukVJ2q9hiWCb7tyiE+AcAfw3AHEu/QE0gJn+OBEMIoYcnhPdLKQ9p3Y+CHgDwiBDirwAYAEwXQvxvKeWXNO5LEbEyNf1vANYIIVKFEHMAzAXwtsY9hasLwIPX//wQgIsa9hIJhwEsBwAhxDwAKYjNB7XfREp5WkqZLaUskFIWwDPFuTSWQngyQoiH4ZkGfERK+ZHW/Sjk/wCYK4SYI4RIgWcB6L9p3JNihOe3wh8DOCul/J7W/ShJSrlZSpl//d/bGgBvxEsIAxqMiCcihPgcgP8JYBaAeiHEKSllmZTyfSHEQQBn4Jkm+7q
UckTLXhXwFQC1QohkAA4A6zTuR2mvAHhFCPEegGEA/y1ORlWJYg+AVACvXx/1vyWlfELblsIjpXQLIb4BoAGADsArUsr3NW5LSQ8A+HsAp4UQp67XnpFS/la7lmgq+GQtIiIiDcXK1DQREVFcYhATERFpiEFMRESkIQYxERGRhhjEREREGmIQExERaYhBTEREpCEGMRERkYb+L0a+VQSRg7jdAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.figure(figsize=(8, 8))\n", + "for task in range(9):\n", + " good_ids = dataset[dataset.task == task].id.values\n", + " indexes = np.where([n in good_ids for n in ids])[0]\n", + " xs = x[indexes].T\n", + " plt.plot(xs[0], xs[1], 'o', label='task ' + str(task))\n", + " \n", + "plt.legend()\n", + "plt.savefig('SMALLscaaU_pooling100task')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "2cb6b9b6-a3d9-4dea-858f-26185a2aed96", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeIAAAHSCAYAAAAwk8gOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAABbCklEQVR4nO3deXxU5dn/8c+dhSQCBlkTQIVQAQ0JIYZNBMHIosiiVMXaUmzV+lgr8BSqlErRasVCC7TWWvoTkUfKIlgwRQsSsUSg0LAYRAIIRVkSSaAJi0mYJOf3xzAjA5OQZTJnJvm+Xy9ekHvmnHPNJOSaezn3ZSzLQkREROwRYncAIiIiDZkSsYiIiI2UiEVERGykRCwiImIjJWIREREbKRGLiIjYKMyOi7Zs2dLq0KGDHZcWERHxu+3bt+dbltXK22O2JOIOHTqQmZlpx6VFRET8zhjzRUWPaWhaRETERkrEIiIiNlIiFhERsZESsYiIiI2UiEVERGykRCwiImIjJWIREREbKRGLiIjYSIlYRETERkrEIiIiNlIiFhERsZESsYiIiI2UiEVERGxU60RsjLnWGLPBGPOZMWaPMWaCLwITERFpCHxRBrEU+KllWTuMMU2B7caYDyzL+swH5xYREanXat0jtiwrx7KsHRf+fQbYC7Sr7XmDUWFaGgduT2XvjTdx4PZUCtPS7A5JREQCnC96xG7GmA5AD2Crl8ceAx4DuO6663x5WZ/LyV3NoYOzKS7JITIilrhOk4mNGVXpMYVpaeQ8Ox2ruBiA0uPHyXl2OgDRI0bUecwiIhKcfLZYyxjTBFgJTLQs6/Slj1uWNd+yrBTLslJatWrlq8v6XE7uarKzp1FcchywKC45Tnb2NHJyV1d63Ik5c91J2MUqLubEnLl1F6yIiAQ9nyRiY0w4ziS82LKsd3xxTrscOjib8vIij7by8iIOHZxd6XGlOTnVahcREQHfrJo2wOvAXsuyflf7kOxVXOI9cVbU7hIWG1utdhEREfBNj7gf8D3gdmPMrgt/7vLBeW0RGeE9cVbU7tJ60kRMZKRHm4mMpPWkib4KTURE6qFaL9ayLOtjwPggloAQ12ky2dnTPIanQ0KiiOs0udLjXAuyTsyZS2lODmGxsbSeNFELtUREpFI+XTVdH7hWR1d31TQ4k7ESr4iIVIcSsRexMaOqlHhFRERqS3tNi4iI2EiJWERExEZKxCIiIjZSIhYREbGRErGIiIiNlIj9YM2hNQxZMYTENxMZsmIIaw6tsTskEREJELp9qY6tObSGGZtnUFzmLAiRcy6HGZtnADA8briNkYmISCBQj
7iOzdsxz52EXYrLipm3Y55NEYmISCBRIq5juedyq9UuIiINixJxHYtpHFOtdhERaViUiOvYhOQJRIZ6VmWKDI1kQvIEmyISEZFAosVadcy1IGvejnnknsslpnEME5InaKGWiIgASsR+MTxuOMVX9eWlQznsLnHwy9xwiq86xZiY5naHJiIiNlMi9oOVuaeYvO8IReUWAEdLHEzedwRAyVhEpIGr94l41c5jzFq7j+MFRbRtFsWUoV0Y3aOdX2N46VCOOwm7FJVbvHQo57JEnJWVRXp6OoWFhURHR5OamkpiYqI/wxURET+q14l41c5jTH1nN0WOMgCOFRQx9Z3dAIzu0Y6c3NUcOjib4pIcIiNiies0uU7qEB8rcVSpPSsri7S0NBwOZ3thYSFpaWkASsYiIvVUvV41PWvtPncSdilylDFr7T5ycleTnT2N4pLjgEVxyXGys6eRk7va53G0iwivUnt6ero7Cbs4HA7S09N9HpOIiASGep2IjxcUVdh+6OBsyss9Hy8vd7b72tS4WKJCjEdbVIhhalysR1thYaHX4ytqFxGR4FevE3HbZlEVtheX5Hh9rKL22hgT05zZXa6lfUQ4BmgfEc7sLtdeNj8cHR3t9fiK2kVEJPjV60Q8ZWgXosJDPdqiwkOZMrQLkRGxXo+pqL22xsQ0J/OWeHIGJZF5S7zX1dKpqamEh3sOV4eHh5OamlonMYmIiP3qdSIe3aMdL92bQLtmURigXbMoXro3gdE92hHXaTIhIZ495pCQKOI6TbYnWJwLskaMGOHuAUdHRzNixAgt1BIRqceMZVlXfpaPpaSkWJmZmX6/7qX8tWpaREQaNmPMdsuyUrw9Vq9vX7qS2JhRSrwiImKrej00LSIiEuiUiEVERGykRCwiImIjJWIREREbKRGLiIjYSIlYRETERkrEIiIiNmrQ9xEHg/1bc9my+iBnT5XQpHkEfUd1onPvGLvDEhERH1EiDmD7t+ayYXE2pefLATh7qoQNi7MBlIxFROoJDU0HsC2rD7qTsEvp+XK2rD5oU0QiIuJrSsQB7Oypkmq1i4hI8AnuRJy1HOZ0gxnNnH9nLbc7Ip9q0jyiWu0iIhJ8gjcRZy2HtKeg8AhgOf9Oe6peJeO+ozoR1sjzWxTWKIS+ozrZFJGIiPha8Cbi9OfBUeTZ5ihyttcTnXvHMOihru4ecJPmEQx6qKsWaomI1CPBu2q68Gj12oNU594xSrwiIvVY8Cbi6PYXhqW9tIuHvRkbyFi6iDMn82naoiX9x47jxv6D7A5LREQI5qHp1OkQHuXZFh7lbBe3vRkbWDf/Fc7k54FlcSY/j3XzX2Fvxga7QxMREYI5ESfeDyN+D9HXAsb594jfO9vFLWPpIkrPe97uVHq+hIyli2yKSERELha8Q9PgTLpKvJU6czK/Wu0iIuJfwdsjlipp2qJltdpFRMS/fJKIjTELjDEnjDGf+uJ84jv9x44jrJHnBiBhjSLoP3acTRGJiMjFfNUjXggM89G5xIdu7D+IIY89SdOWrcAYmrZsxZDHntSqaRGRAOGTOWLLsjYaYzr44lziezf2H6TEKyISoDRHLCIiYiO/JWJjzGPGmExjTGZeXp6/LisiIhLQ/JaILcuab1lWimVZKa1atfLXZUVERAKahqZFRERs5Kvbl5YAW4Auxpijxpgf+uK8IiIi9Z2vVk0/6IvziIiINDQamhYREbGRErGIiIiNlIhFRERspEQsIiJiIyViERERGykRi4iI2EiJWERExEZKxCIiIjZSIhYREbGRErGIiIiNlIhFRERspEQsIiJiIyViERERGykRi4iI2MgnZRDFf7KyskhPT6ewsJDo6GhSU1NJTEy0OywREakhJeIgkpWVRVpaGg6HA4DCwkLS0tIAlIxFRIKUhqaDSHp6ujsJuzgcDtLT022KSEREaks94gC3N2MDGUsXceZkPme6JIMxlz2nsLDQhshERMQXGlQiXpl7ipcO5XCsxEG7i
HCmxsUyJqa53WFVaG/GBtbNf4XS8yUAGMd5rEYRlz0vOjra36GJiIiPNJhEvDL3FJP3HaGo3ALgaImDyfuOAARsMs5YusidhAEa5R2jJPZ6CAl1t4WHh5Oamlrtc+fkrubQwdkUl+QQGRFLXKfJxMaMqviArOWQ/jwUHoXo9pA6HRLvr/Z1RUTEU4OZI37pUI47CbsUlVu8dCjHpoiu7MzJfI+vG50+RUTOFxiHMzlHR0czYsSIai/UysldTXb2NIpLjgMWxSXHyc6eRk7uau8HZC2HtKeg8AhgOf9Oe8rZLiIitdJgesTHShzVag8ETVu05Ex+nkdbo9OnaNEolMf++EaNz3vo4GzKy4s82srLizh0cLb3XnH68+DwfD6OIme7esUiIrXSYHrE7SLCq9UeCPqPHUfYJXPCYY0i6D92XK3OW1zifRSgonYKj1avXUREqqzB9IinxsV6zBEDRIUYpsbF2hgVnNt5gtNrD1NWUEJoswiuHtqBxj1aA3Bj/0EA7lXTTVu0pP/Yce72moqMiL0wLH15u1fR7S8MS3tpFxGRWmkwidi1ICuQVk2f23mCgncOYDnKASgrKKHgnQMAHsm4ton3UnGdJpOdPc1jeDokJIq4TpO9H5A63TknfPHwdHiUs11ERGqlwSRicCbjQFohfXrtYXcSdrEc5Zxee9idiOuCax64yqumXfPAWjUtIuJzDSoRB5qygpJqtftSbMyoym9XulTi/Uq8IiJ1oMEs1gpEoc0u35yjsnYREal/lIhtdPXQDphwz2+BCQ/h6qEd7AlIRET8TkPTfnZpGcNbe/ak/WcRXldNi4hI/adE7Efeyhiu2/3PC7tj9bI5OhERsYOGpv1IZQxFRORSSsR+VFG5QpUxFBFpuJSI/aiicoUqYygi0nApEftRamoq4eGee1vXtIyhiIjUD1qs5UeucoUXr5pOTU2tdhlDERGpP5SI/SwxMVGJV0RE3DQ0LSIiYiMlYhERERspEYuIiNhIiVhERMRGSsQiIiI2UiIWERGxkRKxiIiIjZSIRUREbOSTRGyMGWaM2WeM+dwY84wvzikVy8ldzaZN/Un/8Fts2tSfnNzVdockIiI1VOudtYwxocAfgcHAUeDfxph3Lcv6rLbnlsvl5K4mO3sa5eVFABSXHCc7exoAsTGj7AxNRERqwBc94l7A55ZlHbIs6zywFFBGqCOHDs52J2GX8vIiDh2cbVNEIiJSG75IxO2AIxd9ffRCm9SB4pKcarWLiEhg89tiLWPMY8aYTGNMZl5enr8uW+9ERsRWq11ERAKbLxLxMeDai75uf6HNg2VZ8y3LSrEsK6VVq1Y+uGzDFNdpMiEhUR5tISFRxHWabFNEIiJSG74og/hv4AZjTEecCXgs8B0fnFe8cC3IOnRwNsUlOURGxBLXabIWaomIBKlaJ2LLskqNMU8Ca4FQYIFlWXtqHZlUKDZmlBKviEg94YseMZZlvQe854tziYiINCTaWUtERMRGSsQiIiI28snQtASPcztPcHrtYcoKSghtFsHVQzvQuEdru8MSEWmwlIgbkHM7T1DwzgEsRzkAZQUlFLxzAEDJWETEJkrEDcjptYfdSdjFcpRzeu1hnyZi9bpFRKpOibgBKSsoqVZ7TVTW6z52vpwtqw9y9lQJTZpH0HdUJzr3jvHZtUVEgpEScQMS2izCa9INbRbhs2tU1Os+8X87+LAknLIyA8DZUyVsWJwNoGQsIg2aVk03IFcP7YAJ9/yWm/AQrh7awWfXqKh3HRZ+lTsJu5Re6CGLiDRk6hE3IK552rqcv62o111U7uXJOHvGIiINmRJxA9O4R2ufL5wqTEvjxJy5lObkEBE/hEad74Xyb3q/VmkJB84WAY0vO7ZJc98Ni4uIBCMlYqmVwrQ0cp6djlVcDEDJp2spP3eWRl1GYiKaYRWdomTP37jKUU5Il+9QHvpN4g1rFELfUZ3sCl1EJCAoETdA+7fm+mz18ok5c91J2MXxn
02U/3cP5cXF7sdiAMLD+SLxO5wrCtGqaRGRC5SIG5j9W3PZsDib0vPOSdvarl4uzcnx2l5WWEjb37zsHrIOi40leeI9DBpxR82DFxGph5SIG5gtqw+6k7CLa/VyTRJxWGwspcePe22PHjGC6BEjahyriEhDoNuXGpiKVinXdPVy60kTMZGRHm0mMpLWkybW6HwiIg2NesQNTJPmEV6Tbk1XL7t6vBcPQbeeNFE9YRGRKlIibmD6jurkMUcMtV+9rCFoEZGaUyKux7wVX3DNA2vPZxGRwKBEXEO+vAWoLlRWfKFz75iAitUXVu08xpIlq+h6NIOmZWcJv7o5Q8Y9zI39B9kdmohIpbRYqwZctwC55lpdtwDt35prc2TfqKzkYX2zaucx/rJwBT2+/ICry85igNLTp3j/tT+wN2OD3eGJiFRKibgGKrsFKFD4o+RhoJi1dh8peVsIt0o92q3S82QsXWRTVCIiVaNEXAO+vgWoLlRU2tCXJQ8DxfGCIpqWnfX62JmT+X6ORkSkepSIa6CiW30CqYCBP0oeBoq2zaI4E9rE62NNW7T0czQiItWjRFwDfUd1IqyR51sXaAUMGvdoTbN7b3D3gEObRdDs3ht8XnkpEEwZ2oXMVn1xGM+1hyasEf3HjrMpKhGRqtGq6RoIlluA6qLkYSAa3aMd8G2WLAnTqmkRCTrGsiy/XzQlJcXKzMz0+3VFRETsYIzZbllWirfHNDQtIiJiIyViERERGykRi4iI2EiLtURERIC9GRtY99afcRSc4VxkGQcT4YF7fsLwuOF1el31iIXCtDQO3J7K3htv4sDtqRSmpdkdkoiIX+3N2MD7f57H1+WNONcpEatDbzqd6sHKv77BmkNr6vTaSsQNXGFaGjnPTqf0+HGwLEqPHyfn2elKxiLSoGQsXURJ1NWUxF6P1SgCjMEKj+C68zey+MPFdXptJeIG7sScuVjFxR5tVnExJ+bMtScgEREbnMnP43yrdhAS6vlASCjtctrV6bWViBu40pycarWLiNRHRVdZWOGNvD52VdlVdXptJeIGLiw2tlrtIiL10b9vOIkpPe/1sYjGdVtHQIm4gWs9aSImMtKjzURG0nrSRHsCEhGxQVHnqzkUuRfKyzzaS00pMV361em1lYgbuOgRI4j91fOEtW0LxhDWti2xv3qe6BEj7A5NRMRvJiRPYEf7L9naOpNzoeewsDgX+jX/Cgnn9c/KrnyCWtB9xEL0iBFKvCLSoA2PG87EpTv5stVajlz7DyxHM0ryhlJ6ugeGojq9thKxiIgI0NjRi4KDPS5rj44Kr9PramhaREQEMKZ67b6iRCwiIgIUfO2oVruvKBGLiIgAbZtFVavdV5SIRUREgClDuxAV7rmzVlR4KFOGdqnT62qxloiICDC6RzuafpxOo4Wv0fzcfznV+BrOj3+c1B7a4lJERKTOFaal0e71ObQ8919CgJbn/ku71+fUeRGcWiViY8x9xpg9xphyY0yKr4ISERHxN7uK4NS2R/wpcC+w0QexiIiI2MauIji1SsSWZe21LGufr4IRERGxi11FcDRHLCIign1FcK64atoYsx6I8fLQNMuyVlf1QsaYx4DHAK677roqBygiIuIPrj33T8yZS2lODmGxsbSeNLHO9+I3lmXV/iTGfARMtiwrsyrPT0lJsTIzq/RUERGRoGeM2W5ZltdFzRqaFhERsVFtb1+6xxhzFOgLrDHGrPVNWBJMVuaeImXzHmI37CJl8x5W5p6yOyQRkaBRq521LMv6G/A3H8UiQWhl7ikm7ztCUblziuNoiYPJ+44AMCamuZ2hiYgEBQ1NS628dCjHnYRdisotXjpUt/fdiYjUF0rEUivHSryXB6uoXUREPCkRS620iwj32t6CU+TkVvnuNhGRBkuJWGplalwsUSHGo62RVcz91ptkZ09TMhYRuQIlYqmVMTHNmd3lWlpyCqxyWloneIQ/0Y+PKS8v4uOsnzJkxRDWHFpjd6giI
gFJ9Yil1sbENKfZZ48Bl28Oc02oRc65HGZsngHA8LjhVTrn3owNZCxdxJmT+TRt0ZL+Y8dxY/9BPoxaRCQwqEcsPhEZ4X1T9P+WOYeti8uKmbdjXpXOtTdjA+vmv8KZ/DywLM7k57Fu/ivszdjgs3hFRAKFErH4RFynyYSERHm0nS+Hvxd+M+iSey63SufKWLqI0vMlHm2l50vIWLqo9oGKiAQYDU2LT8TGjALg0MHZFBUf579lhr8XhrHj629WVcc09lY75HJnTuZXq11EJJipRyw+Exszin79Mijp+Ht+c+IajyQcGRrJhOQJVTpP0xYtq9UuIhLMlIjF54bHDWfGLTOIbRyLwRDbOJYZt8yo8kKt/mPHEdYowqMtrFEE/ceOq4twRURspaFpqRPD44ZXOfFeyrU62terpgvT0vxeZ1RE5EqUiCUg3dh/kE9vVypMSyPn2elYxcUAlB4/Ts6z0wGUjEXEVhqalgbhxJy57iTsYhUXc2LOXHsCEhG5QIlYGoTSHO/VoCpqFxHxFyViaRDCYr1vOFJRu4iIvygRS4PQetJETGSkR5uJjKT1pIn2BCQicoEWa0mD4FqQpVXTIhJolIilwYgeMUKJV0QCjhKxBJVVO48xa+0+jhcU0bZZFFOGdmF0j3Z2hyUiUmNKxBI0Vu08xtR3dlPkKAPgWEERU9/ZDaBkLCJBS4u1JGjMWrvPnYRdihxlzFq7z6aIRERqT4lYgsbxgqJqtYuIBAMlYgkabZtFVatdRCQYKBFL0JgytAtR4aEebVHhoUwZ2sWmiEREak+LtSRouBZkadW0iNQnSsQSVEb3aKfEKyL1ioamRUREbKQesYiINHh2bhakRCwiIg2a3ZsFaWhaREQaNLs3C1IiFhGRBs3uzYKUiEVEpEGze7MgJWIREWnQ7N4sSIu1RESkQbN7syAlYhERafDs3CxIQ9MiIiI2UiIWERGxkRKxiIiIjZSIRa4kaznM6QYzmjn/zlpud0QiUo9osZZIZbKWQ9pT4LhwY3/hEefXAIn32xeXiPjMmkNrmLdjHrnncolpHMOE5AkMjxvut+urRyxSmfTnv0nCLo4iZ7uIBL01h9YwY/MMcs7lYGGRcy6HGZtnsObQGr/FoEQsUpnCo9VrF5GgMm/HPIrLij3aisuKmbdjnt9iUCIWqUx0e6/NK6+/j5TNe4jdsIuUzXtYmXvKz4GJiC/knsutVntdUCIWqUzqdAj33G92ZeydTO7wPxwtcWABR0scTN53RMlYJAjFNI6pVntdUCIWqUzi/TDi9xB9LWAg+lpe6vpTii75r1NUbvHSoRx7YhSRGpuQPIHI0EiPtsjQSCYkT/BbDLVaNW2MmQWMAM4DB4GHLcsq8EFcIoEj8X6PFdLHNuzy+rRjJQ4/BSQivuJaHW3nquna3r70ATDVsqxSY8zLwFTg6dqHJRK42kWEc9RL0m0XEW5DNCJSW8Pjhvs18V6qVkPTlmWtsyyr9MKX/wK8r2wRqUemxsUSFWI82qJCDFPjYm2KSESCmS/niH8AvF/Rg8aYx4wxmcaYzLy8PB9eVsS/xsQ0Z3aXa2kfEY4B2keEM7vLtYyJaW53aCIShIxlWZU/wZj1gLflY9Msy1p94TnTgBTgXutKJwRSUlKszMzMGoQrIiISfIwx2y3LSvH22BXniC3LuuMKJx8P3A2kViUJi4iIyDdqu2p6GPAz4DbLsr72TUgiIiINR23niF8BmgIfGGN2GWNe80FMIiIiDUatesSWZX3LV4GIiIg0RNpZS0RExEaqRyziRytzT/HSoRyOlThoFxHO1LhY3fYk0sApEYv4ycrcU0zed4SicufNBa5iEYCSsUgDpqFpET956VCOOwm7qFiEiCgRi/hJRUUhVCxCpGFTIhbxk4qKQqhYhEjDpkQs4icqFiEi3mixloifuBZkadW0iFxMiVjEj8bENFfiFREPGpoWERGxkRKxiIiIjZSIRUREbKRELCIiYiMlYhERE
RspEYuIiNhIiVhERMRGSsQiIiI2UiIWERGxkRKxiIiIjZSIRUREbKRELCIiYiMlYhG5TGFaGgduT2XvjTdx4PZUCtPS7A5JpN5S9SUR8VCYlkbOs9OxiosBKD1+nJxnpwMQPWKEnaGJ1EvqEYuIhxNz5rqTsItVXMyJOXPtCUiknlOPWKSOrdp5jFlr93G8oIi2zaKYMrQLo3u0szusCpXm5FSrXURqRz1ikTq0aucxpr6zm2MFRVjAsYIipr6zm1U7j9kdWoXCYmOr1S4itaNELFKHZq3dR5GjzKOtyFHGrLX7bIroylpPmoiJjPRoM5GRtJ400Z6AROo5DU2L1KHjBUXVag8ErgVZJ+bMpTQnh7DYWFpPmqiFWiJ1RIlYpA61bRbFMS9Jt22zKBuiqbroESOUeEX8RIlYpA5NGdqFqe/s9hiejgoPZcrQLjZG5d3+rblsWX2Qs6dKaNI8gr6jOtG5d4zdYYnUe0rEInXItTo60FdN79+ay4bF2ZSeLwfg7KkSNizOBlAyFqljSsQidWx0j3YBl3gvtWX1QXcSdik9X86W1QeViEXqmBKxiE3O7TzB6bWHKSsoIbRZBFcP7UDjHq1tieXsqZJqtYuI7+j2JREbnNt5goJ3DlBW4Ex0ZQUlFLxzgHM7T9gST5PmEdVqFxHfCZgescPh4OjRoxRfsrWeNByRkZG0b9+e8PBwu0OpM65dtuYVhBJzyedgy1HO6bWHbekV9x3VyWOOGCCsUQh9R3XyeywiDU3AJOKjR4/StGlTOnTogDHG7nDEzyzL4uTJkxw9epSOHTvaHU6dcO2yVeQoozVNvT7H1UP2N9c8sFZNi/hfwCTi4uJiJeEGzBhDixYtyMvLszuUOnPxLlsnsIjh8p/10Gb2DQV37h2jxCtig4CaI1YSbtjq+/f/eEERYVfvpHGnmSyMXUixOe/xuAkP4eqhHewJTkRsEzA9YpH6rmXMHoqi38GEOPhno0yMgfEnRtOq9BrCm0XaumpaROwTUD3i6li18xj9Zn5Ix2fW0G/mhwFdzeZSq1atwhhDdnb2FZ971113UVBQUOHjL774IvHx8SQmJpKUlMTWrVu9Pm/69OmsX7++piGLD0S0XosJcbi//ig6k/E3/ILvJD5H7DO9lIRFGqig7BFfvOgFviktBwTUxgmlpaWEhV3+Fi9ZsoRbb72VJUuW8Nxzz1V6jvfee6/Cx7Zs2cLf//53duzYQUREBPn5+Zw/f/6y55WVlfH8889X/wWIT512eJ//rqhdRBqGoOwR10VpucOHD9OtWzf317Nnz2bGjBkA/P73v+emm24iMTGRsWPHAnDu3Dl+8IMf0KtXL3r06MHq1asBWLhwISNHjuT2228nNTX1suucPXuWjz/+mNdff52lS5cC8I9//IP77rvP/ZyPPvqIu+++G4AOHTqQn5/vNeacnBxatmxJRIRzgU/Lli1p27at+7inn36a5ORk3n77bcaPH8+KFStq/P5I7cU09r4QqqJ2EWkYgjIR+7u03MyZM9m5cydZWVm89tprgHNI+Pbbb2fbtm1s2LCBKVOmcO7cOQB27NjBihUr+Oc//3nZuVavXs2wYcPo3LkzLVq0YPv27dxxxx1s3brVffyyZcvcCb8yQ4YM4ciRI3Tu3Jknnnjisuu1aNGCHTt2VOlcUvcmJE8gMtSzzm9kaCQTkifYFJGIBIKgTMQVlZCrq9JyiYmJPPTQQ7z11lvuoeZ169Yxc+ZMkpKSGDhwIMXFxXz55ZcADB48mObNm3s915IlS9yJcezYsSxZsoSwsDCGDRtGWloapaWlrFmzhlGjRl0xriZNmrB9+3bmz59Pq1ateOCBB1i4cKH78QceeKCWr1x8aXjccGbcMoPYxrEYDLGNY5lxywyGxw23OzQRsVFQzhHXRWm5sLAwysu/2VXo4h2+1qxZw8aNG0lLS+PFF19k9+7dWJbFypUr6dLF85pbt26lcePGXq9x6tQpPvzwQ3bv3
o0xhrKyMowxzJo1i7Fjx/LKK6/QvHlzUlJSaNrU+4YPlwoNDWXgwIEMHDiQhIQE3nzzTcaPHw9QYRxin+Fxw5V4RcRDrXrExphfGWOyjDG7jDHrjDFtfRVYZUb3aMdL9ybQrlkUBmjXLIqX7k2o1UKtNm3acOLECU6ePElJSQl///vfASgvL+fIkSMMGjSIl19+mcLCQs6ePcvQoUP5wx/+gGVZAOzcufOK11ixYgXf+973+OKLLzh8+DBHjhyhY8eOZGRkcNttt7Fjxw7+8pe/VHkoed++fRw4cMD99a5du7j++utr8OpFRMQute0Rz7Is61kAY8xTwHTg8VpHVQW+Li0XHh7O9OnT6dWrF+3ataNr166Ac8Xxd7/7XQoLC7Esi6eeeopmzZrx7LPPMnHiRBITEykvL6djx47u5F2RJUuW8PTTT3u0jRkzhiVLljBgwADuvvtuFi5cyJtvvlmlmM+ePctPfvITCgoKCAsL41vf+hbz58+v2RsgIiK2MK4eXa1PZMxU4DrLsv7nSs9NSUmxMjMzPdr27t3LjTfe6JNYJHjp50BE6iNjzHbLslK8PVbrOWJjzIvAOKAQGFTb84mIiDQkV0zExpj1gLcbHadZlrXasqxpwLQLPeIngV9WcJ7HgMcArrvuuppH3ACdPHnS6z3J6enptGjRwoaIxA5ZWVmkp6dTWFhIdHQ0qampJCYm2h2WiNTSFROxZVl3VPFci4H3qCARW5Y1H5gPzqHpqgYozvuBd+3aZXcYYqOsrCzS0tJwOJxbZBYWFpKWlgagZCwS5Gq7avqGi74cBVx582QRqbb09HR3EnZxOBykp6fbFJGI+Ept54hnGmO6AOXAF/hpxbRIQ1NYWFitdhEJHrVKxJZljfFVICJSsejoaK9JNzo62oZoRMSXgnKLy2AWGhpKUlIS3bt3Jzk5mc2bN9f5tVx/Dh8+XGfXkrqVmppKeHi4R1t4eLjXRXwiElyCcotLALKWQ/rzUHgUottD6nRIvN/uqDx4K4MYFRXlXni1du1apk6d6rU4hDeWZWFZFiEhIV6/vtTF15Lg5lqQ5Vo1ff31uXTouIu8/DfYtCmWuE6TiY258v7kIhJ4grNHnLUc0p6CwiOA5fw77Slnew35qwzixU6fPs0111wDOHfJSk1NJTk5mYSEBPf5Dh8+TJcuXRg3bhzdunUjIyPD4+tf/epXTJw40X3Ov/zlL0yaNMnr9Sq7RteuXRk/fjydO3fmoYceYv369fTr148bbriBbdu2Vf8NFZ9LTExk0qRJ/OjxHnTo+DHl5XmARXHJcbKzp5GTu9ruEEWkBoKzR5z+PDguKXnoKHK210GveObMmfznP/8hIiKCgoIC4JsyiAsWLKCgoIBevXpxxx3OO7127NhBVlaW1wpMRUVFJCUlUVxcTE5ODh9++CEAkZGR/O1vf+Pqq68mPz+fPn36MHLkSAAOHDjAm2++SZ8+fTh8+LDH12fPnqV79+7MmjWL8PBw3njjDf785z97XAugY8eOvP322xVe4/PPP+ftt99mwYIF9OzZk7/+9a98/PHHvPvuu/z6179m1apVPn9fpWYOHZxNebnnz395eRGHDs5Wr1gkCAVnIi48Wr32WnKVQRw9ejSjR48GnGUQ3333XWbPng1Q5TKIFw8Xb9myhXHjxvHpp59iWRY///nP2bhxIyEhIRw7doyvvvoKgOuvv54+ffq4z3Hx102aNOH222/n73//OzfeeCMOh4OEhITLrgXO210qukbHjh3dx8XHx5OamooxhoSEBM0tB5jikpxqtYtIYAvORBzd/sKwtJf2GvJHGcRL9e3bl/z8fPLy8njvvffIy8tj+/bthIeH06FDB3cMl57v0q8feeQRfv3rX9O1a1cefvjhCq+3ePHiCq8RERHhfl5ISIj765CQEEpLS6v0esQ/IiNiKS457rVdRIJPcM4Rp06H8CjPtvAoZ3sN+aMM4qWys7MpKyujRYsWFBYW0rp1a
8LDw9mwYQNffPFFlc/Tu3dvjhw5wl//+lcefPDBCp9Xm2tI4IjrNJmQEM+f/5CQKOI6TbYpIhGpjeDsEbvmgX24atofZRDBc97WsizefPNNQkNDeeihhxgxYgQJCQmkpKS4r19V999/P7t27XIv/vKmtteQwOCaBz50cDbFJTlERmjVtEgw81kZxOpQGUTfu/vuu5k0aVLQ31eqn4P6Z82hNczbMY/cc7nENI5hQvIEhscNtzssEb+qrAxicA5Ni1tBQQGdO3cmKioq6JOw1D9rDq1hxuYZ5JzLwcIi51wOMzbPYM2hNXaHJhIwgnNoWtyaNWvG/v377Q5DxKt5O+ZRXFbs0VZcVsy8HfPUKxa5QD1iEakzuedyq9Uu4jNZy2FON5jRzPl3LTZ8qmtKxCJSZ2Iax1SrXcQnspbDqic8d19c9UTAJmMlYhGpMxOSJxAZGunRFhkayYTkCTZFJA3C+09DuWf9bsodzvYApDlikXpi/9Zctqw+yNlTJTRpHkHfUZ3o3NvenqdrHlirpsWvik5Vr91m6hH7WW5uLmPHjqVTp07cfPPN3HXXXezfv5+PPvqIu+++2yfXaNKkSY2Pfe2111i0aFG1jysoKODVV1+t8XWldvZvzWXD4mzOnioB4OypEjYszmb/VvvnYofHDWfdt9eR9f0s1n17nZKwyCWCNhGvObSGISuGkPhmIkNWDAnI2yEu3RrSsizuueceBg4cyMGDB9m+fTsvvfSSe7/nQPD4448zbty4ah9Xk0RsWZbHtqJSc1tWH6T0vOd7WXq+nC2rD9oUkYiNorzv9V9hu82CMhHXxb2J/iiDuGHDBsLDw3n88cfdbd27d6d///6As0zht7/9bbp27cpDDz3k3j6zQ4cO5OfnA5CZmcnAgQPdz3/44YdJSEggMTGRlStXelwvPz+fvn37smbNGj766CNuu+02Ro0aRVxcHM888wyLFy+mV69eJCQkcPCg8xf2jBkz3IUsBg4cyNNPP02vXr3o3LkzGRkZAOzZs4devXqRlJREYmIiBw4c4JlnnuHgwYMkJSUxZcoUAGbNmkXPnj1JTEzkl7/8pft9vriM45EjXvYMl2pz9YSr2i5Sr935MoQ28mwLbeRsD0BBOUfs73sTfVUG8dNPP+Xmm2+u8Do7d+5kz549tG3bln79+rFp0yZuvfXWCp//q1/9iujoaHbv3g3Af//7X/djX331FSNHjuSFF15g8ODBfPTRR3zyySfs3buX5s2bExcXxyOPPMK2bduYN28ef/jDH5g7d+5l1ygtLWXbtm289957PPfcc6xfv57XXnuNCRMm8NBDD3H+/HnKysqYOXMmn376qbva07p16zhw4ADbtm3DsixGjhzJxo0bue666zzKOIpvNGke4TXpNmke4eXZIvVcHWyDXJeCMhH7+95EX5ZBrEyvXr1o395ZQSopKYnDhw9XmojXr1/P0qVL3V+79pl2OBykpqbyxz/+kdtuu839eM+ePYmNdVbo6dSpE0OGDAEgISGBDRs2eL3GvffeC8DNN9/sLofYt29fXnzxRY4ePcq9997LDTfccNlx69atY926dfTo0QNw9t4PHDjAddddd1lZR6m9vqM6sWFxtsfwdFijEPqO6mRjVCI2Srw/YBPvpYJyaLou7k28UhnEH//4x+zYsYOePXtSWlrqLoO4a9cudu3axZdffuneI7miMojx8fFs3769whguLkUYGhrqnmO+OLaL46rstdx8882sXbu2wvNXtdSh6zkXx/Od73yHd999l6ioKO666y4+/PDDy46zLIupU6e635/PP/+cH/7wh0DF74/UXOfeMQx6qKu7B9ykeQSDHupq+6ppEbmyoEzEdXFvoj/KIN5+++2UlJQwf/58d1tWVpZ77rUiHTp0cCfwi+eBBw8ezB//+Ef3166haWMMCxYsIDs7m5df9v2cyKFDh4iLi+Opp55i1KhRZGVl0bRpU86cOeN+ztChQ1mwYAFnz54F4
NixY5w4ccLnscg3OveO4fu/7sePX7ud7/+6n5KwSJAIykQ8PG44M26ZQWzjWAyG2MaxzLhlRq3mhy8ugzh48ODLyiAmJCTQo0cPjzKIDoeDxMRE4uPjefbZZ694DWMMf/vb31i/fj2dOnUiPj6eqVOnEhNT+S/MX/7yl0yYMIGUlBRCQ0Pd7b/4xS/473//S7du3ejevbvH8HJoaChLlizhww8/9PltRcuXL6dbt24kJSXx6aefMm7cOFq0aEG/fv3o1q0bU6ZMYciQIXznO9+hb9++JCQk8O1vf9sjUYuIiJPKIEpA0c+BiNRHKoMoIiISoIJy1bSICMCqnceYtXYfxwuKaNssiilDuzC6Rzu7wxKpFiViEQlKq3YeY+o7uylylAFwrKCIqe8476lXMpZgoqFpEQlKs9bucydhlyJHGbPW7rMpIpGaUSIWkaB0vKCoWu0igUqJWETqTtZymNMNZjRz/u3Dwuxtm0VVq10kUCkR+1lFZRArM2XKFOLj45kyZQp5eXn07t2bHj16XHEjEG/Gjx9Px44dSUpKIjk5mS1btrjbV6xYUaPXJOJV1nJIewoKjwCW8++0p3yWjKcM7UJUeKhHW1R4KFOGdvHJ+UX8JWgXaxWmpXFizlxKc3IIi42l9aSJRI8YYXdYHkpLSwkL++YtdpVB/P73v+/eI/qTTz7hq6++onPnzhWeZ/78+Zw6dYrQ0FCWLl1KQkIC/+///b8axzVr1iy+/e1vs27dOn70ox+RlZVV43OJVCj9eXBcMkzsKHK2+2APYNeCLK2almAXlD3iwrQ0cp6dTunx42BZlB4/Ts6z0ylMS6vxOe0ug2hZFlOmTKFbt24kJCSwbNkyAEaOHMnZs2e5+eabefnll/nZz37G6tWrSUpKoqioiHXr1tG3b1+Sk5O577773FtKbt++ndtuu42bb76ZoUOHkpOTc9lrHjBgAJ9//vll7RUd+5e//IWePXvSvXt3xowZw9dffw3A22+/7d7da8CAAYBzR7IpU6a4yyD++c9/rv43RYJb4dHqtdfA6B7t2PTM7fxn5nA2PXO7krAEpaDsEZ+YMxfrkuIHVnExJ+bMrZNesT/KIL7zzjvs2rWLTz75hPz8fHr27MmAAQN49913adKkibu8YJs2bcjMzOSVV14hPz+fF154gfXr19O4cWNefvllfve73zF16lR+8pOfsHr1alq1asWyZcuYNm0aCxYs8LhmWloaCQkJHm0Oh6PCY++9914effRRwLm95uuvv85PfvITnn/+edauXUu7du3c78/rr79OdHQ0//73vykpKaFfv34MGTKEjh071uZbIcEkuv2FYWkv7SLiFpSJuNRL766y9tryRxnEjz/+mAcffJDQ0FDatGnDbbfdxr///W9GjhxZ4TH/+te/+Oyzz+jXrx8A58+fp2/fvuzbt49PP/2UwYMHA87eqav8ITjnnF944QVatWrF66+/7nHOyo799NNP+cUvfkFBQYG78AVAv379GD9+PPfff7+7bOK6devIyspyzzsXFhZy4MABJeKGJHW6c0744uHp8Chnu4i4BWUiDouNdQ5Le2mv8TmvUAZx48aNpKWl8eKLL7J79253GcQuXTwXhmzdurXSMoi+XBBlWRaDBw9myZIlHu27d+8mPj7evRDrUq454orOWdGx48ePZ9WqVXTv3p2FCxfy0UcfAfDaa6+xdetW1qxZw80338z27duxLIs//OEP7mQtgWNvxgYyli7izMl8mrZoSf+x47ix/yDfXyjIirOL2CUo54hbT5qIifQsg2giI2k9aWKNz2l3GcT+/fuzbNkyysrKyMvLY+PGjfTq1avS8/Xp04dNmza553nPnTvH/v376dKlC3l5ee5k6nA42LNnT5Xeh8qOPXPmDLGxsTgcDhYvXuw+5uDBg/Tu3Zvnn3+eVq1aceTIEYYOHcqf/vQnHA4HAPv37+fcuXNVikHqzt6MDayb/wpn8vPAsjiTn8e6+a+wN
2NDhceszD1FyuY9xG7YRcrmPazMPVX1CybeD5M+hRkFzr+VhEUuE5Q9Ytc8sC9XTV9cBrFdu3aXlUEsLCzEsiyPMogTJ04kMTGR8vJyOnbs6E7eFXGVQZw4cSIvv/wykZGRdOjQgblz53LrrbeyZcsWunfvjjGG3/zmN1csj9iqVSsWLlzIgw8+SElJCQAvvPACnTt3ZsWKFTz11FMUFhZSWlrKxIkTiY+Pv+L70KhRowqP/dWvfkXv3r1p1aoVvXv3dpc1nDJlCgcOHMCyLFJTU+nevTuJiYkcPnyY5ORkLMuiVatWrFq1qgrfCalLGUsXUXq+xKOt9HwJGUsXee0Vr8w9xeR9Rygqd37gPFriYPI+57zvmJjqTb+IiHcqgygBRT8Hdeu3Y0eAt//zxvDTpZffdZCyeQ9HSxyXtbePCCfzlit/sBMRJ5VBFBEAmrZoWa32Y16ScGXtIlJ9QTk0LSI103/sONbNf8VjeDqsUQT9x47z+vx2EeFee8TtIsLrLEaRqsrKyiI9PZ3CwkKio6NJTU0lMTHR7rCqTT1ikQbkxv6DGPLYkzRt2QqMoWnLVgx57MkKV01PjYslKsR4tEWFGKbG1fwOBRFfyMrKYvWqdyksLASct0iuXvVuUO4UqB6xSANzY/9BVb5dybUg66VDORwrcdAuIpypcbE+Xai15tAa5u2YR+65XGIaxzAheQLD44b77PxSP/3jvXWUlZd6tJWVl/KP99YFXa9YiVhEKjUmpnmdrZBec2gNMzbPoLjMed9+zrkcZmyeAaBkLJX6uugsmArag4wSsYhcUV3Nxc3bMc+dhF2Ky4qZt2OeErFUKqQsgvKwEq/twcYnc8TGmJ8aYyxjjPell+JWkzKIVXXXXXe593oW8ZWsrCzS0tI85uLS0tJ8MheXey63Wu0iLs25AcovSWHlIc72IFPrHrEx5lpgCPBl7cOpuv1bc9my+iBnT5XQpHkEfUd1onPvyjfA8DdflUG8EsuysCyL9957r9Yxi1wqPT3dvUOai8PhID09vda94pjGMeScu3yP+JjGgfV/WQLPkBG3subtMs5EHqI8tISQsgiaFscx5L5b7Q6t2nzRI54D/Azw284g+7fmsmFxNmdPOYclzp4qYcPibPZvrfmnaLvLII4bN85j56mHHnqI1atXs3DhQkaNGsXAgQO54YYbeO6559zxdunShXHjxtGtWzeOHDlChw4dyM/PB+Ctt96iV69eJCUl8aMf/YiysjIAmjRpwrRp0+jevTt9+vThq6++qvF7Jg2Dqydc1fbqmJA8gchQz+1qI0MjmZA8odbn9mb/1lze/Pkm/vj4h7z58021+p0h9urcO4bh993G9eUDaPXVAK4vH8Dw+24LuA5ZVdQqERtjRgHHLMv6xEfxVMmW1QcpPV/u0VZ6vpwtqw/WyfVmzpzJzp07ycrK4rXXXgO+KYO4bds2NmzYwJQpU9x7Ke/YsYMVK1bwz3/+0+M8lZVB/OEPf8jChQsB5y+4zZs3M3y4c45s27ZtrFy5kqysLN5++21cu5IdOHCAJ554gj179nD99de7z7V3716WLVvGpk2b2LVrF6Ghoe69oc+dO0efPn345JNPGDBgAH/5y19890ZJvRQdHV2t9uoYHjecGbfMILZxLAZDbONYZtwyo07mh+viA7zYq3PvGL7/6378+LXb+f6v+wVlEoYqDE0bY9YD3l7dNODnOIelr8gY8xjwGMB1111XjRAv5/qPVNX22vJHGcTbbruNJ554gry8PFauXMmYMWPcw9qDBw+mRYsWANx77718/PHHjB49muuvv54+ffpcdq709HS2b99Oz549ASgqKqJ169aAcy/pu+++G4Cbb76ZDz74oJrvhjQ0qamppKWleQxPh4eHXzbiU1PD44b7ZWFWZR/gg/UXuNQPV0zElmXd4a3dGJMAdAQ+McYAtAd2GGN6WZZ12UdMy7LmA/PBudd0bYJu0jzCa9Jt0rzmq+UCoQziu
HHjeOutt1i6dClvvPGGu/3C+3vZ1xVdx7Isvv/97/PSSy9d9lh4eLj7+NDQUEpLSy97jsjFXPPAwb6Dkb8/wItUVY2Hpi3L2m1ZVmvLsjpYltUBOAoke0vCvtZ3VCfCGnmGHtYohL6jOtX4nHaXQQRnvd+5c+cCcNNNN7mf88EHH3Dq1CmKiopYtWoV/fr1q/Q6qamprFixghMnTgBw6tQpvvjii6q/GSKXSExMZNKkScyYMYNJkyYFXRKGij+o1+YDvIgvBOUWl517xzDooa7u/0BNmkcw6KGutRpeurgM4uDBgy8rg5iQkECPHj08yiA6HA4SExOJj4/n2WefveI1XGUQ169fT6dOnYiPj2fq1Knucodt2rThxhtv5OGHH/Y4rlevXowZM4bExETGjBlDSorXAh5uN910Ey+88AJDhgwhMTGRwYMHk5Nz+cpUkYakLj7Ai/iCyiAGkK+//pqEhAR27NjhXgizcOFCMjMzeeWVV2yOzj/0cyB1KRhue5T6qbIyiNpZK0CsX7+eH/7wh0yaNMknq1FF5HKde8co8QaxvRkbyFi6iDMn82naoiX9x46r8r7pgUyJOEDccccdXudxx48fz/jx4/0fkIhIANmbscGjhOeZ/DzWzXeOFAZ7Mg7KOWIREWlYMpYu8qijDVB6voSMpYtsish3lIhFRCTgnTmZX632YKKhaREJaOd2nuD02sOUFZQQ2iyCq4d2oHGP1naHJX7WtEVLzuTneW0PduoRi0jAOrfzBAXvHKCswDkkWVZQQsE7Bzi384TNkYm/9R87jrBGnvd8hzWKoP/YcTZF5DtKxH4WGhpKUlIS8fHxdO/end/+9rceO3pVR2ZmJk899ZTXxy4uAOG6puvPzJkzL3v+Rx995N76UiRQnF57GMvh+f/DcpRzeu1hewIS29zYfxBDHnuSpi1bgTE0bdmKIY89GfQLtSCIh6aDYRn7pWUQAaKioti1axcAJ06c4Dvf+Q6nT592V1WqjpSUlCtu7nHpNUWCiasnXNV2qd9u7D8o4H7P+0JQ9ohdy9jP5OeBZbmXse/N2FDjc/qjDOKlWrduzfz583nllVewLIvDhw/Tv39/kpOTSU5OZvPmzQCMHTuWNWvWuI8bP348K1as8OjFnjx5kiFDhhAfH88jjzxCVTZq+cc//kHXrl1JTk7mnXfecbdX9NoGDBjgkdBvvfVWPvnEr4W3pIEJbeZ9+8mK2qV+yMldzaZN/Un/8Fts2tSfnNzVdodUp4IyEft7GbuvyiB6ExcXR1lZGSdOnKB169Z88MEH7Nixg2XLlrmHnR944AGWL18OwPnz50lPT3eXSHR57rnnuPXWW9mzZw/33HOPuxIUOKsvXTw0vWzZMoqLi3n00UdJS0tj+/bt5OZ+s0V4Ra/t4lKN+/fvp7i4mO7du9f8jRW5gquHdsCEe/6aMuEhXD20gz0BSZ3LyV1NdvY0ikuOAxbFJcfJzp5Wr5NxUCZify9jd5VBfOutt9xDzevWrWPmzJkkJSUxcODAWpdBBHA4HDz66KMkJCRw33338dlnnwFw5513smHDBkpKSnj//fcZMGAAUVFRHsdu3LiR7373uwAMHz6ca665xv2Ya2ja9eeBBx4gOzubjh07csMNN2CMcR9b2Wu77777+Pvf/47D4WDBggXaaETqXOMerWl27w3uHnBoswia3XuDVk3XY4cOzqa8vMijrby8iEMHZ9sUUd0LyjniuljG7o8yiN4cOnSI0NBQWrduzXPPPUebNm345JNPKC8vJzIyEoDIyEgGDhzI2rVrWbZsmXt4vK5U9NrA+SFj9erVLF++nO3bt9dpHCLgTMZKvA1HcYn3AjUVtdcHQdkjrotl7P4og3ipvLw8Hn/8cZ588kmMMRQWFhIbG0tISAj/93//R1lZmfu5DzzwAG+88QYZGRkMGzbssnMNGDCAv/71rwC8//77/Pe//6302l27duXw4cMcPHgQgCVLlrgfq+y1PfLII
zz11FP07NnTo9ctIuILkRGx1WqvD4KyR+xaNefLVdMXl0Fs167dZWUQCwsLsSzLowzixIkTSUxMpLy8nI4dO7qTd2Vc87UOh4OwsDC+973v8b//+78APPHEE4wZM4ZFixYxbNgwj571kCFD+N73vseoUaNo1KjRZef95S9/yYMPPkh8fDy33HIL11133WXXdBk2bBgzZ85k/vz5DB8+nKuuuor+/ftz5swZgEpf280338zVV199WalGEZGaKExL48ScuZTm5BAWG0vs5Nv54qqVHsPTISFRxHWabGOUdUtlEKVajh8/zsCBA8nOziYkxPcDKvo5EGk4CtPSyHl2OtZFU4EmMpJGvx5JTvSHFJfkEBkRS1ynycTGjLIx0tpTGUTxiUWLFjFt2jR+97vf1UkSFpGG5cScuR5JGMAqLqZs9sf0+zDDpqj8T4lYqmzcuHGMGxf828mJSGAozfG+AKui9vpKiVhE6r2c3NUcOji7Xg111gdhsbGUHj/utb0hUSIWkXrNtUGEa/GPa4MIQMm4jl1pK+LWkyZ6nSNuPWmiDdHaRxN9IlJvndt5gv07f93gNogIBFXZijh6xAhif/U8YW3bgjGEtW1L7K+eJ3rECBsj9z/1iEWkXnKVUCwd6H3Hvfq8QUQgqGwr4ot7xdEjRjS4xHsp9Yj9rDplEA8fPowxhl/84hfutvz8fMLDw3nyySf9FbJIUHKVUAwrbuH18fq8QUQguNJWxA2tsENlgjYRn9t5gpyZ2zj6TAY5M7cFZKHw0tLSy9pc+z7v2bOHDz74gPfff99rCUTXsR07dvSovPT2228THx9fZ/GJ1BeuUoktD4zBlHluglPfN4gIBBVtOdy0RcsGWdihMkGZiF1DTq7/aGUFJRS8c6BWyTgQyiB6O/aqq67ixhtvxLUByrJly7j//vvd58jLy2PMmDH07NmTnj17smnTpmrFl5OTw4ABA0hKSqJbt25kZDSce/ekfnMViojOvYU2e8YTVtQCLAgraUnXri9qoVYdq2wr4oZY2KEyQTlH7BpyupjlKOf02sN1sjn8zJkz+c9//kNERAQFBQXAN6UCFyxYQEFBAb169eKOO+4AnGUQs7KyqlSB6eIyiJcee/jwYcBZj3jp0qW0adOG0NBQ2rZty/ELS/4nTJjApEmTuPXWW/nyyy8ZOnQoe/furXJ8v/3tbxk6dCjTpk2jrKyMr7/+2sfvnog9rh7agYJ3DmA5yonOvYXo3Fsw4SHO6k0xKiJR1yrbijj9w4ZX2KEyQZmIXT3hqrbXlqsM4ujRoxk9ejTgLBX47rvvMnu28xOcL8ogVnTssGHDePbZZ2nTpg0PPPCAx2Pr1693l0sEOH36NGfPnq1yfD179uQHP/gBDoeD0aNHe+xJLeJr+7fmsmX1Qc6eKqFJ8wj6jupE594xdXIt14fy02sPU1ZQQmizCK4e2kGVnPzoxv6DvNYAiIyIvTAsfXl7QxSUiTi0WYTXpOsaiqqJQCiDCHg9tlGjRtx888389re/5bPPPuPdd991P1ZeXs6//vUvd8lEl6rGN2DAADZu3MiaNWsYP348//u//6vds6RO7N+ay4bF2ZSed/4/O3uqhA2LswHqNBkr8QaeuE6TPe7thoY9bx+Uc8RXD+2ACfcM3YSHcPXQDjU+ZyCUQazMT3/6U15++eXLestDhgzhD3/4g/vrXbt2AZWXMrzYF198QZs2bXj00Ud55JFH2LFjR7Vfh0hVbFl90J2EXUrPl7Nl9UGbIqqCrOUwpxvMaOb8O2u53RHVC7Exo+ja9UUiI9oChsiItg163j4oe8R1MeQUCGUQKxMfH+91tfTvf/97fvzjH5OYmEhpaSkDBgzgtddeq3J8H330EbNmzSI8PJwmTZqwaNGiKrxbItV39pT3qaOK2m2XtRzSngLHhV5b4RHn1wCJ91d8nFRJbMyoBpt4L6UyiBJQ9HNQf735801ek26T5hF8/9f9bIjoCuZ0cybfS0VfC
5M+9X88ASArK4v09HQKCwuJjo4mNTWVxMREu8MKCpWVQQzKoWkRCT59R3UirJHnr5ywRiH0HdXJpoiuoPBoBe1HGuQwdVZWFmlpaRQWFgJQWFhIWloaWVlZNkcW/JSIRcQvOveOYdBDXWnS3LmosknzCAY91LXOFmrVWnT7ih9zDVM3oGScnp6Ow+HwaHM4HKSnp9sUUf0RlHPEIhKcOveOCdzEe6nU6Z5zxJdyFEH68w1mvtjVE65qu1SdErGIiDeuBJv+vPe5Yqh4+NqHAmVeNjo62mvSjY6O9nss9Y2GpkUkoNm6r3zi/c6FWdHXen+8suFrHwikednU1FTCw8M92sLDw6+4la9cmXrEIhJwVuae4qVDORwrcdCmqJwfR5ZxJ9/sKw/4d6MOb8PU4VHO9jpU2bxsp7IYv+4a5uqFB0LvvL5Rj9jPXGUQXX9c+0lfqqCggFdffbVK52zSpAkAx48f59vf/jbg3Njjvffeu+Kxv/vd79wFLVJTU/niiy88Hj99+jTt27f3KLs4bdo0rr32Wvd1q3oukapYmXuKyfuOcLTEgQXkRoXwYrdI3o9x9htc+8r7VeL9MOL3F3rGxvn3iN/X+fxwZfOyvi58UxWJiYlMmjSJGTNmMGnSpMuS8MrcU6Rs3kPshl2kbN7DytxTdRpPfRG0PeJAmTepTGlpKWFhnm+xqwzilbgS8RNPPFHl67Vt25YVK1YAzkScmZnJXXfdVekxPXr0IDMzk6uuuoo//elP/OxnP2PZsmXux5999lkGDBjgccyIESN48sknueGGG6p1LpGqeOlQDkXlnvsbFIca/tg5gjtznaU762pf+Uol3u/3hVkVzcs2IdKvhW+qwvUByvW9O1riYPI+59z6mJia7b3fUARlj7gu5k3sKIMIcPbsWVJTU0lOTiYhIcF9nmeeeYaDBw+SlJTElClTKnyet9dw/vx5pk+fzrJly0hKSmLZsmXccMMN5OXlAc5tO7/1rW+Rl5fHoEGDuOqqqwDo06cPR49+s/hk+/btfPXVVwwZMsTjOn369CE29vLN2Ss7l0hVHStxeG3/KvKbbWBrs698MKloXjblfJxH2/sxYdw9oDHJvcJt64l6+wBVVG7x0qGGWVGpOoKyR1zZvEld9Ip9WQbRtcUlQMeOHXn77bf529/+xtVXX01+fj59+vRh5MiRzJw5k08//dTdey4tLfX6PG97VDdq1Ijnn3+ezMxMXnnlFQCys7NZvHgxEydOZP369XTv3p1WrVp5HPf6669z5513As5k/dOf/pS33nqL9evXV/s9u/hcItXRLiKco16ScZti5y/52u4rH0wqmpdt9V6xe1Tg/ZgwXuwWSXGo83eBXT3Rij5AVdQu3wjKROzv+9l8WQbx0qFph8PBz3/+czZu3EhISAjHjh3jq6++uuw4y7K8Pi8mpmr3ZP7gBz9g1KhRTJw4kQULFvDwww97PP7WW2+RmZnJP//5TwBeffVV7rrrLtq3r/6q0EvPJVIdU+NiPYY4ASLLLH68v2GWMkxMTLysg3Gu7IS71vIfO0e4k7CLqyfqz0Rc0QeodhHhXp4tFwvKRFwX97PZVQZx8eLF5OXlsX37dsLDw+nQoYPHtav7vIpce+21tGnThg8//JBt27axePFi92Pr16/nxRdf5J///CcREc4hvy1btpCRkcGrr77K2bNnOX/+PE2aNGHmzJmVXsfbuUSqw5U8XKum20WEMzUuljF3aJ7R5eLCNxcP2V/M3z1Rbx+gokIMU+MaZo3h6gjKOeK6uJ/NjjKI4OzFt27dmvDwcDZs2OBeady0aVPOnDlzxedV5NLjAR555BG++93vct999xEaGuqO+0c/+hHvvvuuuyYyOBP/l19+yeHDh5k9ezbjxo27YhKu6Fwi1TUmpjmZt8STMyiJzFvitdjHi8Y9WhP7TC/aRTby+ri/e6JjYpozu8u1tI8IxwDtI8KZ3eVafe+qICgTcWJiIiNGjHD3gKOjo
xkxYkSt5ocvLoM4ePDgy8ogJiQk0KNHD48yiA6Hg8TEROLj43n22WdrdN2HHnqIzMxMEhISWLRokfu6LVq0oF+/fnTr1o0pU6ZU+LyKDBo0iM8++8y9WAtg5MiRnD171mNY2rUQ7L777iMpKYmRI0deMeaf/exntG/fnq+//pr27du7F7XV5FwiUjtT42KJCvHsFXv0RP1YU1kfoGpGZRAbkMzMTCZNmkRGRobdoVRIPwci1XfxBijuofyY5pfXVAbnRiR+uAdaPFVWBrFWc8TGmBnAo0DehaafW5Z15V0kxO9mzpzJn/70J4+5YRGpH8bENPfe+0x//vKiFQ2sWEUw8MXQ9BzLspIu/FESDlDPPPMMX3zxBbfeeqvdoYiIv1RYU1n3+AeSoJwjFhGRKqioKEUdF6uQ6vFFIn7SGJNljFlgjLnGB+cTERFfSJ3unBO+mB+KVUj1XDERG2PWG2M+9fJnFPAnoBOQBOQAv63kPI8ZYzKNMZmurRZFRKQO2VSsQqrniou1LMu6oyonMsb8Bfh7JeeZD8wH56rpqgYoIiK1YEOxCqmeWg1NG2Mu3jLlHuDT2oUjIiLSsNR2jvg3xpjdxpgsYBAwyQcxVUlO7mo2bepP+offYtOm/uTkXl6NKBC5avgePnyYv/71rzU+z8CBA7n0Xmxw3iv81FNP1fi8lxo/fry7tOLFLq59vHDhQo96xSLitObQGoasGELim4kMWTGENYfW2B2SBKBa3UdsWdb3fBVIdeTkriY7exrl5c7744pLjpOdPQ2A2JhRdoTklbd6xC6uRPyd73zHp9dMSUkhJcXrPeM+dXHtYxG53JpDa5ixeQbFZc494XPO5TBj8wwAhscNtzEyCTRBefvSoYOz3UnYpby8iEMHZ9f4nP6uR/zMM8+QkZFBUlISc+bM4fDhw/Tv35/k5GSSk5PZvHmz+7kvv/wyCQkJdO/enWeeecbd/vbbb9OrVy86d+7s3i3ro48+4u67767wujNmzHBXjALo1q0bhw8fBmDRokUkJibSvXt3vve9bz5jbdy4kVtuuYW4uDh38r30/Tp+/DjDhg3jhhtu4Gc/+5m7/X/+539ISUkhPj6eX/7ylxXGJVLfzNsxz52EXYrLipm3Y55NEUmgCsrqS8Ul3gtNV9ReW76sR3zxOWfPnu0uLvH111/zwQcfEBkZyYEDB3jwwQfJzMzk/fffZ/Xq1WzdupWrrrqKU6e+KfhdWlrKtm3beO+993juuedqVDfYZc+ePbzwwgts3ryZli1belwnJyeHjz/+mOzsbEaOHOkekr7Yrl272LlzJxEREXTp0oWf/OQnXHvttbz44os0b96csrIyUlNTycrKqpOa0SJVVZiWxok5cynNySEsNpbWkyYSPWKEz6+Tey63Wu3ScAVljzgywntZrYraa8tVj/itt95yDzWvW7eOmTNnkpSUxMCBA6tcj7giDoeDRx99lISEBO677z4+++wzwFlW8OGHH+aqq64C8DjvvffeC8DNN9/s7tXW1Icffsh9991Hy5YtL7vO6NGjCQkJ4aabbvJaKxmcFbGio6OJjIzkpptucleHWr58OcnJyfTo0YM9e/a4X5eIHQrT0sh5djqlx4+DZVF6/Dg5z06nMC3N59eKaey9VnhF7dJwBWUijus0mZAQz5vUQ0KiiOs0ucbnvFI94h//+Mfs2LGDnj17Ulpa6q5HvGvXLnbt2sWXX37pLlZQnXrELnPmzKFNmzZ88sknZGZmcv78+Sse46r3GxoaSmlpaZWuU9nrvNJ1ACoqEnLxc1zx/Oc//2H27Nmkp6eTlZXF8OHDq1VDWcTXTsyZi3XJz6BVXMyJOXN9fq0JyROIDI30aIsMjWRC8gSfX0uCW1Am4tiYUXTt+iKREW0BQ2REW7p2fbFWC7X8XY/YW73h2NhYQkJC+L//+z/KysoAZ+/6jTfe4OuvvwbwGDKuiQ4dOrBjxw7AOYT+n//8B4Dbb7+dt99+m5MnT
/rkOgCnT5+mcePGREdH89VXX/H+++/X+pwitVGa4336qqL22hgeN5wZt8wgtnEsBkNs41hm3DJDC7XkMkE5RwzOZOzLFdIX1yNu167dZfWICwsLsSzLox7xxIkTSUxMpLy8nI4dO7qTd1UkJiYSGhpK9+7dGT9+PE888QRjxoxh0aJFDBs2zN2rHjZsGLt27SIlJYVGjRpx11138etf/7rGr9N1jfj4eHr37k3nzp0BiI+PZ9q0adx2222EhobSo0cPFi5cWOPrAHTv3p0ePXrQtWtXrr32Wvr161er84nUVlhsrHNY2kt7XRgeN1yJV65I9YgloOjnQOqSa4744uFpExlJ7K+er5MFWyIudVaPWEQkmLiSrT9WTYtUlRJxPfXGG28wb57n/Yr9+vXjj3/8o00RiQSG6BEjlHgloCgR11MPP/wwDz/8sN1hiDRIezM2kLF0EWdO5tO0RUv6jx3Hjf0H2R2WBCglYhERH9qbsYF181+h9HwJAGfy81g3/xUAJWPxKihvXxIRCVQZSxe5k7BL6fkSMpYusikiCXRKxCIiPnTmZH612kWUiP3MVQaxNseWl5fz1FNP0a1bNxISEujZs6d7Y46qeu2111i0SJ/QRXytaYuW1WoXCdo54pW5p3jpUA7HShy0iwhnalwsY2Kqt79zXausDGJtLFu2jOPHj5OVlUVISAhHjx6t1raapaWlPP744z6P61JlZWWEhobW+XVEAkn/seM85ogBwhpF0H/sOBujkkAWlD3ilbmnmLzvCEdLHFjA0RIHk/cdYWVuzbdl9HcZRMuymDJlirtXu2zZMvdjs2bNomfPniQmJnotHZiTk+PeDhOgffv2XHPNNYCz1zxp0iTi4+NJTU0lLy8PgIEDBzJx4kRSUlKYN2+eRznEgQMH8vTTT19WUvHrr7/m/vvv56abbuKee+6hd+/euDZiWbduHX379iU5OZn77ruPs2fPAs4tNJ9++mmSk5N5++23q/ldEAl+N/YfxJDHnqRpy1ZgDE1btmLIY09qoZZUKCh7xC8dyqGo3HNHsKJyi5cO5dRJr7guyiC+88477Nq1i08++YT8/Hx69uzJgAED2L17NwcOHGDbtm1YlsXIkSPZuHEjAwYMcB97//33c+utt5KRkUFqairf/e536dGjB+D8gJCSksKcOXN4/vnnee6553jlFeeKzfPnz7sTqetDhou3koqvvvoq11xzDZ999hmffvopSUlJAOTn5/PCCy+wfv16GjduzMsvv8zvfvc7pk+fDkCLFi3c+1mLNEQ39h+kxCtVFpQ94mMljmq111ZdlEH8+OOPefDBBwkNDaVNmzbcdttt/Pvf/2bdunWsW7eOHj16kJycTHZ2NgcOHPA4tn379uzbt4+XXnqJkJAQUlNTSU9PByAkJIQHHngAgO9+97t8/PHH7uNc7d54K6n48ccfu0cAunXr5q4j/K9//YvPPvuMfv36kZSUxJtvvukue3il64iIiKeg7BG3iwjnqJek2y4ivMbnvFIZxI0bN5KWlsaLL77I7t273WUQu3Tp4nGerVu31qgMootlWUydOpUf/ehHlT4vIiKCO++8kzvvvJM2bdqwatUqr0Phxhj3vyuLqzolFS3LYvDgwSxZssTr47V5/SIiDU1Q9oinxsUSFWI82qJCDFPjal5Bxd9lEPv378+yZcsoKysjLy+PjRs30qtXL4YOHcqCBQvcc67Hjh3jxIkTHsfu2LGD4xcqyJSXl5OVlcX111/v/nrFihUA/PWvf+XWW2+t8XvSr18/li9fDsBnn33G7t27AejTpw+bNm3i888/B5zD4fv376/xdUREGrKg7BG75oF9uWraH2UQS0tL3T3Pe+65hy1bttC9e3eMMfzmN78hJiaGmJgY9u7dS9++fQHn4qu33nqL1q1bu89z4sQJHn30UUpKnKsye/XqxZNPPgk4e6Pbtm3jhRdeoHXr1h6LwKrriSee4Pvf/z433XQTXbt2JT4+n
ujoaFq1asXChQt58MEH3TG88MIL7pKKIiJSdSqD6EeffPIJjz76KNu2bauzazRp0sTdm66tsrIyHA4HkZGRHDx4kDvuuIN9+/bRqFEjn5zfm4bwcyAiDY/KIAaA1157jd///vfMnTvX7lCq7Ouvv2bQoEE4HA4sy+LVV1+t0yQsItIQKRH7yeOPP+6XTTR81RsGaNq0KZeOXIiIiG8F5WItERGR+iKgErEd89USOPT9F5GGKGAScWRkJCdPntQv4wbKsixOnjxJZGSk3aGIiPhVwMwRt2/fnqNHj7r3RpaGJzIykvbt29sdhoiIXwVMIg4PD6djx452hyEiIuJXATM0LSIi0hApEYuIiNhIiVhERMRGtmxxaYzJA76o5CktgXw/hWMHvb7gptcX3PT6gluwvr7rLctq5e0BWxLxlRhjMivak7M+0OsLbnp9wU2vL7jVx9enoWkREREbKRGLiIjYKFAT8Xy7A6hjen3BTa8vuOn1Bbd69/oCco5YRESkoQjUHrGIiEiDEFCJ2BhznzFmjzGm3BiTcsljU40xnxtj9hljhtoVo68YY5KMMf8yxuwyxmQaY3rZHZOvGWN+YozJvvA9/Y3d8dQFY8xPjTGWMaal3bH4kjFm1oXvXZYx5m/GmGZ2x+QLxphhF36HfG6MecbueHzJGHOtMWaDMeazC//nJtgdk68ZY0KNMTuNMX+3OxZfCqhEDHwK3AtsvLjRGHMTMBaIB4YBrxpjQv0fnk/9BnjOsqwkYPqFr+sNY8wgYBTQ3bKseGC2zSH5nDHmWmAI8KXdsdSBD4BulmUlAvuBqTbHU2sXfmf8EbgTuAl48MLvlvqiFPipZVk3AX2AH9ez1wcwAdhrdxC+FlCJ2LKsvZZl7fPy0ChgqWVZJZZl/Qf4HAj2HqQFXH3h39HAcRtjqQv/A8y0LKsEwLKsEzbHUxfmAD/D+b2sVyzLWmdZVumFL/8F1IeyWL2Azy3LOmRZ1nlgKc7fLfWCZVk5lmXtuPDvMzgTVjt7o/IdY0x7YDjw/+yOxdcCKhFXoh1w5KKvjxL8P2ATgVnGmCM4e4tB3+O4RGegvzFmqzHmn8aYnnYH5EvGmFHAMcuyPrE7Fj/4AfC+3UH4QH38PeKVMaYD0APYanMovjQX5wffcpvj8Dm/l0E0xqwHYrw8NM2yrNX+jqcuVfZagVRgkmVZK40x9wOvA3f4M77ausLrCwOa4xwi6wksN8bEWUG0TP8Kr+/nOIelg1ZV/i8aY6bhHPJc7M/YpOaMMU2AlcBEy7JO2x2PLxhj7gZOWJa13Rgz0OZwfM7vidiyrJokm2PAtRd93f5CW0Cr7LUaYxbhnO8AeJsgHG65wuv7H+CdC4l3mzGmHOcesXn+iq+2Knp9xpgEoCPwiTEGnD+PO4wxvSzLyvVjiLVypf+LxpjxwN1AajB9gKpEUP4eqQ5jTDjOJLzYsqx37I7Hh/oBI40xdwGRwNXGmLcsy/quzXH5RLAMTb8LjDXGRBhjOgI3ANtsjqm2jgO3Xfj37cABG2OpC6uAQQDGmM5AI4Jzo/bLWJa127Ks1pZldbAsqwPOIc7kYErCV2KMGYZzGHCkZVlf2x2Pj/wbuMEY09EY0wjnAtB3bY7JZ4zzU+HrwF7Lsn5ndzy+ZFnWVMuy2l/4/zYW+LC+JGGwoUdcGWPMPcAfgFbAGmPMLsuyhlqWtccYsxz4DOcw2Y8tyyqzM1YfeBSYZ4wJA4qBx2yOx9cWAAuMMZ8C54Hv15NeVUPxChABfHCh1/8vy7Ietzek2rEsq9QY8ySwFggFFliWtcfmsHypH/A9YLcxZteFtp9blvWefSFJVWhnLRERERsFy9C0iIhIvaRELCIiYiMlYhERERspEYuIiNhIiVhERMRGSsQiIiI2UiIWERGxkRKxiIiIjf4/plhe2/YJr1kAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# task 2 - show on user-level\n", + "plt.figure(figsize=(8, 8))\n", + "users = np.unique(dataset.user.values)\n", + "for user in users[0:10]:\n", + " good_ids = dataset[dataset.user == user].id.values\n", + " indexes = np.where([n in good_ids for n in ids])[0]\n", + " xs = x[indexes].T\n", + " plt.plot(xs[0], xs[1], 'o', label='user ' + user)\n", + " \n", + "plt.legend()\n", + "plt.savefig('SMALLscaaU_pooling100user')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffa83416-2037-49b8-aab7-21a0dcf4fc82", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md index 40389212..1dea0b64 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,100 @@ Library for analyzing source code with graphs and NLP. What this repository can 5. Train Graph Neural Network for learning representations for source code 6. Predict Python types using NLP and graph embeddings -### Installation +For more details consult our [wiki](https://github.com/VitalyRomanov/method-embedding/wiki). 
+### Installing Python Libraries + +You need to use conda, create virtual environment `SourceCodeTools` with python 3.8 +```bash +conda create -n SourceCodeTools python=3.8 +``` + +If you plan to use graphviz +```python +conda install -c conda-forge pygraphviz graphviz +``` + +Install CUDA 11.1 if needed +```python +conda install -c nvidia cudatoolkit=11.1 +``` + +To install SourceCodeTools library run ```bash git clone https://github.com/VitalyRomanov/method-embedding.git cd method-embedding pip install -e . -``` \ No newline at end of file +# pip install -e .[gpu] +``` + +### Processing Python Code + +Source code should be structured in the following way +``` +source_code_data +│ +└───package1 +│ │───source_file_1.py +│ │───source_file_2.py +│ └───subfolder_if_needed +│ │───source_file_3.py +│ └───source_file_4.py +│ +└───package2 + │───source_file_1.py + └───source_file_2.py +``` +An example of source code data can be found in this repository `method-embedding\res\python_testdata\example_code`. A package should contain self-sufficient code with its dependencies. Unmet dependencies will be labeled as non-indexed symbol. + +#### Indexing with Docker +To create dataset need to first perform indexing with Sourcetrail. The easiest way to do this is with a docker container +```bash +docker run -it -v "/full/path/to/data/folder":/dataset mortiv16/sourcetrail_indexer +``` + +#### Creating graph +Need to provide a sentencepiece model for subtokenization. Model trained on CodeSearchNet can be downloaded [here](https://www.dropbox.com/s/cw7oxkzicgnkzgb/sentencepiece_bpe.model?dl=1). 
+```bash +SCT=/path/to/SourceCodeTool_repository +SOURCE_CODE=/path/to/source/code/indexed/with/sourcetrail +DATASET_OUTPUT=/path/to/dataset/output +python $SCT/SourceCodeTools/code/data/sourcetrail/DatasetCreator2.py --bpe_tokenizer sentencepiece_bpe.model --track_offsets --do_extraction $SOURCE_CODE $DATASET_OUTPUT +``` + +The graph dataset format is [described in wiki](https://github.com/VitalyRomanov/method-embedding/wiki/04.-Graph-Format-Description) +``` +graph_dataset +│ +└───no_ast +│ │───common_call_seq.bz2 +│ │───common_edges.bz2 +│ │───common_function_variable_pairs.bz2 +│ │───common_nodes.bz2 +│ │───common_source_graph_bodies.bz2 +│ └───node_names.bz2 +│ +└───with_ast + │───common_call_seq.bz2 + │───common_edges.bz2 + │───common_function_variable_pairs.bz2 + │───common_nodes.bz2 + │───common_source_graph_bodies.bz2 + └───node_names.bz2 +``` + +`no_ast` contains graph built from global relationships only. `with_ast` contains graph with AST nodes and edges. Two main files for building the graph are `common_nodes.bz2` and `common_edges.bz2`. The files are stored as pickled pandas table (read with `pandas.read_pickle`) and probably not portable between platforms. 
One can view the content by converting the table into the `csv` format
RGGAN\n", + "from SourceCodeTools.tabular.common import compact_property\n", + "from SourceCodeTools.code.data.file_utils import unpersist\n", + "from SourceCodeTools.models.graph.train.objectives.SCAAClassifierObjetive import SCAAClassifierObjective\n", + "\n", + "\n", + "import dgl\n", + "import torch\n", + "import numpy as np\n", + "from argparse import Namespace\n", + "from torch import nn\n", + "from datetime import datetime\n", + "from os.path import join\n", + "from functools import partial" + ] + }, + { + "cell_type": "markdown", + "id": "4becf482", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "tags": [] + }, + "source": [ + "# Prepare parameters and options\n", + "\n", + "Full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are ment to be used as arguments for cli trainer. Trainer script can be found in `SourceCodeTools/scripts/train.py`.\n", + "\n", + "For the task of subgraph classification the important options are:\n", + "- `subgraph_partition` is path to subgraph-based train/val/test sets. Storead as Dataframe with subgraph id and partition mask\n", + "- `subgraph_id_column` is a column is `common_edges` file that stores subgraph id.\n", + "- For variable misuse task (same will apply to authorship attribution) subgraphs are created for individual functions (files for SCAA). The label is stored in `common_filecontent`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5637f70e-b8fd-4a9f-9956-d60a61bd2870", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "tokenizer_path = \"sentencepiece_bpe.model\"\n", + "\n", + "data_path = \"./examples/one_vs_10/with_ast\"\n", + "subgraph_partition = join(data_path, \"subgraph_partition.json.bz2\")\n", + "filecontent_path = join(data_path, \"common_filecontent.json.bz2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "404565f5-09a3-4535-ba51-8533491f7c72", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtrain_maskval_masktest_mask
0888TrueFalseFalse
1887TrueFalseFalse
2644TrueFalseFalse
35341FalseFalseTrue
45318TrueFalseFalse
...............
855314TrueFalseFalse
86957TrueFalseFalse
875311FalseTrueFalse
88454TrueFalseFalse
895334TrueFalseFalse
\n", + "

90 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id train_mask val_mask test_mask\n", + "0 888 True False False\n", + "1 887 True False False\n", + "2 644 True False False\n", + "3 5341 False False True\n", + "4 5318 True False False\n", + ".. ... ... ... ...\n", + "85 5314 True False False\n", + "86 957 True False False\n", + "87 5311 False True False\n", + "88 454 True False False\n", + "89 5334 True False False\n", + "\n", + "[90 rows x 4 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(subgraph_partition)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d09b740a-eaaf-4fac-a73f-0bd9bb671f70", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesource_node_idtarget_node_idfile_idmentioned_in
00subword702516744888NaN
11subword1763716744888NaN
22subword396616744888NaN
33module1674490588885968.0
44module_rev9058167448885968.0
55subword1387513928888NaN
66name1392831048885968.0
77name_rev3104139288885968.0
88names310490588885968.0
99names_rev905831048885968.0
\n", + "
" + ], + "text/plain": [ + " id type source_node_id target_node_id file_id mentioned_in\n", + "0 0 subword 7025 16744 888 NaN\n", + "1 1 subword 17637 16744 888 NaN\n", + "2 2 subword 3966 16744 888 NaN\n", + "3 3 module 16744 9058 888 5968.0\n", + "4 4 module_rev 9058 16744 888 5968.0\n", + "5 5 subword 13875 13928 888 NaN\n", + "6 6 name 13928 3104 888 5968.0\n", + "7 7 name_rev 3104 13928 888 5968.0\n", + "8 8 names 3104 9058 888 5968.0\n", + "9 9 names_rev 9058 3104 888 5968.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(join(data_path, \"common_edges.json.bz2\"), nrows=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d38f83db", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "config = get_config(\n", + " subgraph_id_column=\"file_id\",\n", + " data_path=data_path,\n", + " model_output_dir=data_path,\n", + " subgraph_partition=subgraph_partition,\n", + " tokenizer_path=tokenizer_path,\n", + " objectives=\"subgraph_clf\",\n", + " measure_scores=True,\n", + " dilate_scores=1,\n", + " epochs=100,\n", + " learning_rate=0.001,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bf61e1a2-2a2b-4688-b6b6-4ad3c22b22a5", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "save_config(config, \"var_misuse_tiny.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "07dc7b86", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DATASET': {'data_path': './examples/one_vs_10/with_ast',\n", + " 'train_frac': 0.9,\n", + " 'filter_edges': None,\n", + " 'min_count_for_objectives': 5,\n", + " 'self_loops': False,\n", + " 'use_node_types': False,\n", + " 'use_edge_types': False,\n", + " 'no_global_edges': False,\n", + " 'remove_reverse': False,\n", + " 'custom_reverse': None,\n", + " 
'restricted_id_pool': None,\n", + " 'random_seed': None,\n", + " 'subgraph_id_column': 'file_id',\n", + " 'subgraph_partition': './examples/one_vs_10/with_ast/subgraph_partition.json.bz2'},\n", + " 'TRAINING': {'model_output_dir': './examples/one_vs_10/with_ast',\n", + " 'pretrained': None,\n", + " 'pretraining_phase': 0,\n", + " 'sampling_neighbourhood_size': 10,\n", + " 'neg_sampling_factor': 3,\n", + " 'use_layer_scheduling': False,\n", + " 'schedule_layers_every': 10,\n", + " 'elem_emb_size': 100,\n", + " 'embedding_table_size': 200000,\n", + " 'epochs': 100,\n", + " 'batch_size': 128,\n", + " 'learning_rate': 0.001,\n", + " 'objectives': 'subgraph_clf',\n", + " 'save_each_epoch': False,\n", + " 'save_checkpoints': True,\n", + " 'early_stopping': False,\n", + " 'early_stopping_tolerance': 20,\n", + " 'force_w2v_ns': False,\n", + " 'use_ns_groups': False,\n", + " 'nn_index': 'brute',\n", + " 'metric': 'inner_prod',\n", + " 'measure_scores': True,\n", + " 'dilate_scores': 1,\n", + " 'gpu': -1,\n", + " 'external_dataset': None,\n", + " 'restore_state': False},\n", + " 'MODEL': {'node_emb_size': 100,\n", + " 'h_dim': 100,\n", + " 'n_layers': 5,\n", + " 'use_self_loop': True,\n", + " 'use_gcn_checkpoint': False,\n", + " 'use_att_checkpoint': False,\n", + " 'use_gru_checkpoint': False,\n", + " 'num_bases': 10,\n", + " 'dropout': 0.0,\n", + " 'activation': 'tanh'},\n", + " 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "markdown", + "id": "069a2528", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Create Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d0ce29fc", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/alina/workspace/scaa/method-embedding/SourceCodeTools/code/data/dataset/Dataset.py:131: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " self.nodes = self.nodes.append(new_nodes, ignore_index=True)\n", + "/home/alina/workspace/scaa/method-embedding/SourceCodeTools/code/data/dataset/Dataset.py:132: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " self.edges = self.edges.append(new_edges, ignore_index=True)\n" + ] + } + ], + "source": [ + "dataset = SourceGraphDataset(\n", + " **{**config[\"DATASET\"], **config[\"TOKENIZER\"]}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "897d41bf", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Declare target loading function (labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c279f43e", + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def load_labels():\n", + " filecontent = unpersist(filecontent_path)\n", + " return filecontent[[\"id\", \"user\"]].rename({\"id\": \"src\", \"user\": \"dst\"}, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "717a502c-4399-4e15-a83a-4f16c8e9639c", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
srcdst
0888Itachi_uchiha
1887Itachi_uchiha
2644Fatality2412
35341Fatality2412
45318Chuckminster
.........
855314BarryFam
86957JoelSpringer
875311Avi_Sri
88454Chuckminster
895334DrMeers
\n", + "

90 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " src dst\n", + "0 888 Itachi_uchiha\n", + "1 887 Itachi_uchiha\n", + "2 644 Fatality2412\n", + "3 5341 Fatality2412\n", + "4 5318 Chuckminster\n", + ".. ... ...\n", + "85 5314 BarryFam\n", + "86 957 JoelSpringer\n", + "87 5311 Avi_Sri\n", + "88 454 Chuckminster\n", + "89 5334 DrMeers\n", + "\n", + "[90 rows x 2 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_labels()" + ] + }, + { + "cell_type": "markdown", + "id": "1ac27d90", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "One or several objectives could be used" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "67605a81", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "class Trainer(SamplingMultitaskTrainer):\n", + " def create_objectives(self, dataset, tokenizer_path):\n", + " self.objectives = nn.ModuleList()\n", + " \n", + " self.objectives.append(\n", + " self._create_subgraph_objective(\n", + " objective_name=\"SCAAMatching\",\n", + " objective_class=SCAAClassifierObjective,\n", + " dataset=dataset,\n", + " tokenizer_path=tokenizer_path,\n", + " labels_fn=load_labels,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "393251c2", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "NOTE: Using experimental fast data loading logic. To disable, pass\n", + " \"--load_fast=false\" and report issues on GitHub. 
More details:\n", + " https://github.com/tensorflow/tensorboard/issues/4784\n", + "\n", + "2022-06-07 00:49:57.551867: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2022-06-07 00:49:57.551963: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensorflow listening on http://localhost:6006/\n" + ] + } + ], + "source": [ + "# %tensorboard --logdir data\n", + "from tensorboard import program\n", + "\n", + "tracking_address = './examples/one_vs_10/with_ast' # the path of your log file.\n", + "\n", + "tb = program.TensorBoard()\n", + "tb.configure(argv=[None, '--logdir', tracking_address])\n", + "url = tb.launch()\n", + "print(f\"Tensorflow listening on {url}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "369ed1bb", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Precompute Target Embeddings: 0%| | 0/1 [00:00,\n", + " {'hits@1/train/SCAAMatching_final': 0.1,\n", + " 'hits@3/train/SCAAMatching_final': 0.08888888888888889,\n", + " 'hits@5/train/SCAAMatching_final': 0.0866666666666667,\n", + " 'hits@10/train/SCAAMatching_final': 0.10625,\n", + " 'ndcg@1/train/SCAAMatching_final': 0.08888888888888886,\n", + " 'ndcg@3/train/SCAAMatching_final': 0.08888888888888886,\n", + " 'ndcg@5/train/SCAAMatching_final': 0.08888888888888885,\n", + " 'ndcg@10/train/SCAAMatching_final': 0.10215646036989275,\n", + " 'mr/train/SCAAMatching_final': 23.666666666666668,\n", + " 'mrr/train/SCAAMatching_final': 0.14220794226854017,\n", + " 'map/train/SCAAMatching_final': 0.14952471428902422,\n", + " 'scoring_time/train/SCAAMatching_final': 0.03229260444641113,\n", + " 
'Loss/train/SCAAMatching_final': 0.5069170594215393,\n", + " 'Accuracy/train/SCAAMatching_final': 0.5166666666666667,\n", + " 'hits@1/val/SCAAMatching_final': 0.1,\n", + " 'hits@3/val/SCAAMatching_final': 0.1,\n", + " 'hits@5/val/SCAAMatching_final': 0.1,\n", + " 'hits@10/val/SCAAMatching_final': 0.125,\n", + " 'ndcg@1/val/SCAAMatching_final': 0.08888888888888889,\n", + " 'ndcg@3/val/SCAAMatching_final': 0.08888888888888889,\n", + " 'ndcg@5/val/SCAAMatching_final': 0.08888888888888888,\n", + " 'ndcg@10/val/SCAAMatching_final': 0.10215646036989276,\n", + " 'mr/val/SCAAMatching_final': 23.5,\n", + " 'mrr/val/SCAAMatching_final': 0.15123215602874723,\n", + " 'map/val/SCAAMatching_final': 0.17977364167777116,\n", + " 'scoring_time/val/SCAAMatching_final': 0.008196592330932617,\n", + " 'Loss/val/SCAAMatching_final': 0.5524194836616516,\n", + " 'Accuracy/val/SCAAMatching_final': 0.55,\n", + " 'hits@1/test/SCAAMatching_final': 0.05,\n", + " 'hits@3/test/SCAAMatching_final': 0.08333333333333333,\n", + " 'hits@5/test/SCAAMatching_final': 0.09,\n", + " 'hits@10/test/SCAAMatching_final': 0.11875,\n", + " 'ndcg@1/test/SCAAMatching_final': 0.08888888888888888,\n", + " 'ndcg@3/test/SCAAMatching_final': 0.08888888888888888,\n", + " 'ndcg@5/test/SCAAMatching_final': 0.08888888888888885,\n", + " 'ndcg@10/test/SCAAMatching_final': 0.10215646036989276,\n", + " 'mr/test/SCAAMatching_final': 23.5,\n", + " 'mrr/test/SCAAMatching_final': 0.15123215602874715,\n", + " 'map/test/SCAAMatching_final': 0.1800709349374379,\n", + " 'scoring_time/test/SCAAMatching_final': 0.011611461639404297,\n", + " 'Loss/test/SCAAMatching_final': 0.47625288367271423,\n", + " 'Accuracy/test/SCAAMatching_final': 0.625})" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_procedure(\n", + " dataset, \n", + " model_name=RGGAN, \n", + " model_params=config[\"MODEL\"],\n", + " trainer_params=config[\"TRAINING\"],\n", + " 
model_base_path=get_model_base(config[\"TRAINING\"], 'u-pooling 100 epochs exp 3'),\n", + " trainer=Trainer\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "879cf5e3", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b91dd03", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/SourceCodeTools/cli_arguments/__init__.py b/SourceCodeTools/cli_arguments/__init__.py new file mode 100644 index 00000000..d72b3f5b --- /dev/null +++ b/SourceCodeTools/cli_arguments/__init__.py @@ -0,0 +1,53 @@ +import argparse + + +class AstDatasetCreatorArguments: + parser = None + + def __init__(self): + parser = argparse.ArgumentParser(description='Merge indexed environments into a single graph') + parser.add_argument('--language', "-l", dest="language", default="python", help='') + parser.add_argument('--bpe_tokenizer', '-bpe', dest='bpe_tokenizer', type=str, help='') + parser.add_argument('--create_subword_instances', action='store_true', default=False, help="") + parser.add_argument('--connect_subwords', action='store_true', default=False, + help="Takes effect only when `create_subword_instances` is False") + parser.add_argument('--only_with_annotations', action='store_true', default=False, help="") + parser.add_argument('--do_extraction', action='store_true', 
default=False, help="") + parser.add_argument('--visualize', action='store_true', default=False, help="") + parser.add_argument('--track_offsets', action='store_true', default=False, help="") + parser.add_argument('--recompute_l2g', action='store_true', default=False, help="") + parser.add_argument('--remove_type_annotations', action='store_true', default=False, help="") + parser.add_argument('--seed', type=int, default=None, help="") + + self.parser = parser + self.add_positional_argument() + self.additional_arguments() + + def add_positional_argument(self): + self.parser.add_argument( + 'source_code', help='Path to DataFrame csv.' + ) + self.parser.add_argument('output_directory', help='') + + def additional_arguments(self): + self.parser.add_argument('--chunksize', default=10000, type=int, help='Chunksize for preparing dataset. Larger chunks are faster to process, but they take more memory.') + self.parser.add_argument('--keep_frac', default=1.0, type=float, help="Fraction of the dataset to keep") + + def parse(self): + return self.parser.parse_args() + + +class DatasetCreatorArguments(AstDatasetCreatorArguments): + + def add_positional_argument(self): + self.parser.add_argument('indexed_environments', help='Path to environments indexed by sourcetrail') + self.parser.add_argument('output_directory', help='') + + def additional_arguments(self): + pass + + + + + + diff --git a/SourceCodeTools/code/IdentifierPool.py b/SourceCodeTools/code/IdentifierPool.py new file mode 100644 index 00000000..6b2667be --- /dev/null +++ b/SourceCodeTools/code/IdentifierPool.py @@ -0,0 +1,38 @@ +from os import urandom +from time import time_ns + + +class IdentifierPool: + """ + Creates identifier that is almost guaranteed to be unique. Beginning of identifier is based on + current time, and the tail of identifier is randomly generated. 
+ """ + def __init__(self): + self._used_identifiers = set() + + @staticmethod + def _get_candidate(): + return str(hex(int(time_ns())))[:12] + str(urandom(3).hex()) + # return "0x" + str(urandom(8).hex()) + + def get_new_identifier(self): + candidate = self._get_candidate() + while candidate in self._used_identifiers: + candidate = self._get_candidate() + self._used_identifiers.add(candidate) + return candidate + + +class IntIdentifierPool(IdentifierPool): + def __init__(self): + super().__init__() + + @staticmethod + def _get_candidate(): + candidate = str(int((str(hex(int(time_ns())))[:12] + str(urandom(3).hex())), 16)) + # assert len(candidate) == 19 + return candidate + # candidate = str(int(urandom(10).hex(), 16)) + # while len(candidate) < 19: + # candidate = str(int(urandom(10).hex(), 16)) + # return candidate[:19] \ No newline at end of file diff --git a/SourceCodeTools/nlp/entity/annotator/annotator_utils.py b/SourceCodeTools/code/annotator_utils.py similarity index 67% rename from SourceCodeTools/nlp/entity/annotator/annotator_utils.py rename to SourceCodeTools/code/annotator_utils.py index 3dfba4c6..469d95a7 100644 --- a/SourceCodeTools/nlp/entity/annotator/annotator_utils.py +++ b/SourceCodeTools/code/annotator_utils.py @@ -1,6 +1,34 @@ -from copy import copy +from copy import copy, deepcopy from typing import List, Tuple, Iterable +from SourceCodeTools.nlp import create_tokenizer +from spacy.gold import biluo_tags_from_offsets as spacy_biluo_tags_from_offsets + +from SourceCodeTools.nlp.tokenizers import codebert_to_spacy + + +def biluo_tags_from_offsets(doc, ents, no_localization): + ent_tags = spacy_biluo_tags_from_offsets(doc, ents) + + if no_localization: + tags = [] + for ent in ent_tags: + parts = ent.split("-") + + assert len(parts) <= 2 + + if len(parts) == 2: + if parts[0] == "B" or parts[0] == "U": + tags.append(parts[1]) + else: + tags.append("O") + else: + tags.append("O") + + ent_tags = tags + + return ent_tags + def get_cum_lens(body, 
as_bytes=False): """ @@ -57,10 +85,22 @@ def to_offsets(body: str, entities: Iterable[Iterable], as_bytes=False, cum_lens def adjust_offsets(offsets, amount): + """ + Adjust offset by subtracting certain amount from the start and end positions + :param offsets: iterable with offsets + :param amount: adjustment amount + :return: list of adjusted offsets + """ return [(offset[0] - amount, offset[1] - amount, offset[2]) for offset in offsets] def adjust_offsets2(offsets, amount): + """ + Adjust offset by adding certain amount to the start and end positions + :param offsets: iterable with offsets + :param amount: adjustment amount + :return: list of adjusted offsets + """ return [(offset[0] + amount, offset[1] + amount, offset[2]) for offset in offsets] @@ -143,4 +183,33 @@ def resolve_self_collisions2(offsets): no_collisions = list(set(no_collisions)) - return no_collisions \ No newline at end of file + return no_collisions + + +def align_tokens_with_graph(doc, spans, tokenzer_name): + spans = deepcopy(spans) + if tokenzer_name == "codebert": + backup_tokens = doc + doc, adjustment = codebert_to_spacy(doc) + spans = adjust_offsets(spans, adjustment) + + node_tags = biluo_tags_from_offsets(doc, spans, no_localization=False) + + if tokenzer_name == "codebert": + doc = [""] + [t.text for t in backup_tokens] + [""] + return doc, node_tags + + +def source_code_graph_alignment(source_codes, node_spans, tokenizer="codebert"): + supported_tokenizers = ["spacy", "codebert"] + assert tokenizer in supported_tokenizers, f"Only these tokenizers supported for alignment: {supported_tokenizers}" + nlp = create_tokenizer(tokenizer) + + for code, spans in zip(source_codes, node_spans): + yield align_tokens_with_graph(nlp(code), resolve_self_collisions2(spans), tokenzer_name=tokenizer) + + +def map_offsets(column, id_map): + def map_entry(entry): + return [(e[0], e[1], id_map[e[2]]) for e in entry] + return [map_entry(entry) for entry in column] \ No newline at end of file diff --git 
a/SourceCodeTools/code/ast/__init__.py b/SourceCodeTools/code/ast/__init__.py new file mode 100644 index 00000000..2e067ec7 --- /dev/null +++ b/SourceCodeTools/code/ast/__init__.py @@ -0,0 +1,9 @@ +import ast + + +def has_valid_syntax(function_body): + try: + ast.parse(function_body.lstrip()) + return True + except SyntaxError: + return False diff --git a/SourceCodeTools/code/ast_tools.py b/SourceCodeTools/code/ast/ast_tools.py similarity index 85% rename from SourceCodeTools/code/ast_tools.py rename to SourceCodeTools/code/ast/ast_tools.py index 98b5571d..ee3d09d6 100644 --- a/SourceCodeTools/code/ast_tools.py +++ b/SourceCodeTools/code/ast/ast_tools.py @@ -1,6 +1,6 @@ import ast -from SourceCodeTools.nlp.entity.annotator.annotator_utils import to_offsets, resolve_self_collision, adjust_offsets2 +from SourceCodeTools.code.annotator_utils import to_offsets, resolve_self_collision, adjust_offsets2 def get_mentions(function, root, mention): @@ -30,9 +30,18 @@ def get_mentions(function, root, mention): def get_descendants(function, children): + """ + :param function: function string + :param children: List of targets. + :return: Offsets for attributes or names that are used as target for assignment operation. Subscript, Tuple and List + targets are skipped. 
+ """ descendants = [] + # if isinstance(children, ast.Tuple): + # descendants.extend(get_descendants(function, children.elts)) + # else: for chld in children: # for node in ast.walk(chld): node = chld @@ -42,6 +51,10 @@ def get_descendants(function, children): [(node.lineno-1, node.end_lineno-1, node.col_offset, node.end_col_offset, "new_var")], as_bytes=True) # descendants.append((node.id, offset[-1])) descendants.append((function[offset[-1][0]:offset[-1][1]], offset[-1])) + # elif isinstance(node, ast.Tuple): + # descendants.extend(get_descendants(function, node.elts)) + elif isinstance(node, ast.Subscript) or isinstance(node, ast.Tuple) or isinstance(node, ast.List): + pass # skip for now else: raise Exception("") diff --git a/SourceCodeTools/code/python_ast.py b/SourceCodeTools/code/ast/python_ast.py similarity index 87% rename from SourceCodeTools/code/python_ast.py rename to SourceCodeTools/code/ast/python_ast.py index 07b8f7aa..58927e49 100644 --- a/SourceCodeTools/code/python_ast.py +++ b/SourceCodeTools/code/ast/python_ast.py @@ -6,6 +6,7 @@ from collections.abc import Iterable import pandas as pd # import os +from SourceCodeTools.code.IdentifierPool import IdentifierPool class PythonSyntheticNodeTypes(Enum): # TODO NOT USED @@ -76,6 +77,7 @@ class GNode: # id = None def __init__(self, **kwargs): + self.string = None for k, v in kwargs.items(): setattr(self, k, v) @@ -104,38 +106,53 @@ def __init__(self, source): self.condition_status = [] self.scope = [] - def get_source_from_ast_range(self, start_line, end_line, start_col, end_col): + self._identifier_pool = IdentifierPool() + + def get_source_from_ast_range(self, start_line, end_line, start_col, end_col, strip=True): source = "" num_lines = end_line - start_line + 1 if start_line == end_line: - source += self.source[start_line - 1].encode("utf8")[start_col:end_col].decode( - "utf8").strip() + section = self.source[start_line - 1].encode("utf8")[start_col:end_col].decode( + "utf8") + source += 
section.strip() if strip else section + "\n" else: for ind, lineno in enumerate(range(start_line - 1, end_line)): if ind == 0: - source += self.source[lineno].encode("utf8")[start_col:].decode( - "utf8").strip() + section = self.source[lineno].encode("utf8")[start_col:].decode( + "utf8") + source += section.strip() if strip else section + "\n" elif ind == num_lines - 1: - source += self.source[lineno].encode("utf8")[:end_col].decode( - "utf8").strip() + section = self.source[lineno].encode("utf8")[:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" else: - source += self.source[lineno].strip() + section = self.source[lineno] + source += section.strip() if strip else section + "\n" - return source + return source.rstrip() def get_name(self, *, node=None, name=None, type=None, add_random_identifier=False): + random_identifier = self._identifier_pool.get_new_identifier() + if node is not None: - name = node.__class__.__name__ + "_" + str(hex(int(time_ns()))) + name = f"{node.__class__.__name__}_{random_identifier}" type = node.__class__.__name__ else: if add_random_identifier: - name += f"_{str(hex(int(time_ns())))}" + name = f"{name}_{random_identifier}" + + if hasattr(node, "lineno"): + node_string = self.get_source_from_ast_range(node.lineno, node.end_lineno, node.col_offset, node.end_col_offset, strip=False) + # if "\n" in node_string: + # node_string = None + else: + node_string = None if len(self.scope) > 0: - return GNode(name=name, type=type, scope=copy(self.scope[-1])) + return GNode(name=name, type=type, scope=copy(self.scope[-1]), string=node_string) else: - return GNode(name=name, type=type) + return GNode(name=name, type=type, string=node_string) # return (node.__class__.__name__ + "_" + str(hex(int(time_ns()))), node.__class__.__name__) def get_edges(self, as_dataframe=True): @@ -170,12 +187,14 @@ def parse_body(self, nodes): for node in nodes: s = self.parse(node) if isinstance(s, tuple): + if s[1].type == "Constant": # 
this happens when processing docstring, as a result a lot of nodes are connected to the node Constant_ + continue # in general, constant node has no affect as a body expression, can skip # some parsers return edges and names? edges.extend(s[0]) if last_node is not None: - edges.append({"dst": s[1], "src": last_node, "type": "next"}) - edges.append({"dst": last_node, "src": s[1], "type": "prev"}) + edges.append({"dst": s[1], "src": last_node, "type": "next", "scope": copy(self.scope[-1])}) + edges.append({"dst": last_node, "src": s[1], "type": "prev", "scope": copy(self.scope[-1])}) last_node = s[1] @@ -278,9 +297,9 @@ def generic_parse(self, node, operands, with_name=None, ensure_iterables=False): else: node_name = with_name - if len(self.scope) > 0: - edges.append({"scope": copy(self.scope[-1]), "src": node_name, "dst": self.scope[-1], "type": "mention_scope"}) - edges.append({"scope": copy(self.scope[-1]), "src": self.scope[-1], "dst": node_name, "type": "mention_scope_rev"}) + # if len(self.scope) > 0: + # edges.append({"scope": copy(self.scope[-1]), "src": node_name, "dst": self.scope[-1], "type": "mention_scope"}) + # edges.append({"scope": copy(self.scope[-1]), "src": self.scope[-1], "dst": node_name, "type": "mention_scope_rev"}) for operand in operands: if operand in ["body", "orelse", "finalbody"]: @@ -414,6 +433,8 @@ def parse_ImportFrom(self, node): # # edges.extend(edges_from) # # edges.append({"scope": copy(self.scope[-1]), "src": name_from, "dst": name, "type": "module"}) # return edges, name + if node.module is not None: + node.module = ast.Name(node.module) return self.generic_parse(node, ["module", "names"]) def parse_Import(self, node): @@ -452,6 +473,10 @@ def parse_alias(self, node): # # # if node.asname: # # # edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": node.asname, "type": "alias"}) # return edges, name + if node.name is not None: + node.name = ast.Name(node.name) + if node.asname is not None: + node.asname = 
ast.Name(node.asname) return self.generic_parse(node, ["name", "asname"]) def parse_arg(self, node): @@ -476,9 +501,11 @@ def parse_arg(self, node): ) annotation = GNode(name=annotation_string, type="type_annotation") - edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) - # do not use reverse edges for types, will result in leak from function to function - # edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": annotation, "type": 'annotation'}) + mention_name = GNode(name=node.arg + "@" + self.scope[-1].name, type="mention", scope=copy(self.scope[-1])) + edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": mention_name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) + # edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) + # # do not use reverse edges for types, will result in leak from function to function + # # edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": annotation, "type": 'annotation'}) return edges, name # return 
self.generic_parse(node, ["arg", "annotation"]) @@ -503,9 +530,15 @@ def parse_AnnAssign(self, node): annotation = GNode(name=annotation_string, type="type_annotation") edges, name = self.generic_parse(node, ["target"]) - edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) - # do not use reverse edges for types, will result in leak from function to function - # edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": annotation, "type": 'annotation'}) + try: + mention_name = GNode(name=node.target.id + "@" + self.scope[-1].name, type="mention", scope=copy(self.scope[-1])) + edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": mention_name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) + except Exception as e: + edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) + # print(e) # don't know how I should parse this "Attribute(value=Name(id='self', ctx=Load()), attr='srctrlrpl_1631733463030025000@#attr#', ctx=Store())" + # edges.append({"scope": 
copy(self.scope[-1]), "src": annotation, "dst": name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, "var_end_col_offset": node.end_col_offset}) + # # do not use reverse edges for types, will result in leak from function to function + # # edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": annotation, "type": 'annotation'}) return edges, name # return self.generic_parse(node, ["target", "annotation"]) @@ -854,7 +887,7 @@ def parse_arguments(self, node): # arguments(args=[arg(arg='self', annotation=None), arg(arg='tqdm_cls', annotation=None), arg(arg='sleep_interval', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]) # vararg constains type annotations - return self.generic_parse(node, ["args", "vararg"]) + return self.generic_parse(node, ["args", "vararg"]) # kwarg, kwonlyargs, posonlyargs??? 
def parse_comprehension(self, node): edges = [] @@ -862,9 +895,9 @@ def parse_comprehension(self, node): cph_name = self.get_name(name="comprehension", type="comprehension", add_random_identifier=True) # cph_name = GNode(name="comprehension_" + str(hex(int(time_ns()))), type="comprehension") - if len(self.scope) > 0: - edges.append({"scope": copy(self.scope[-1]), "src": cph_name, "dst": self.scope[-1], "type": "mention_scope"}) - edges.append({"scope": copy(self.scope[-1]), "src": self.scope[-1], "dst": cph_name, "type": "mention_scope_rev"}) + # if len(self.scope) > 0: + # edges.append({"scope": copy(self.scope[-1]), "src": cph_name, "dst": self.scope[-1], "type": "mention_scope"}) + # edges.append({"scope": copy(self.scope[-1]), "src": self.scope[-1], "dst": cph_name, "type": "mention_scope_rev"}) target, ext_edges = self.parse_operand(node.target) edges.extend(ext_edges) diff --git a/SourceCodeTools/code/ast/python_ast2.py b/SourceCodeTools/code/ast/python_ast2.py new file mode 100644 index 00000000..0c271c0a --- /dev/null +++ b/SourceCodeTools/code/ast/python_ast2.py @@ -0,0 +1,1005 @@ +import ast +import logging +from copy import copy +from enum import Enum +from itertools import chain +from pprint import pprint +from time import time_ns +from collections.abc import Iterable +import pandas as pd +# import os +from SourceCodeTools.code.IdentifierPool import IdentifierPool + + +class PythonNodeEdgeDefinitions: + ast_node_type_edges = { + "Assign": ["value", "targets"], + "AugAssign": ["target", "op", "value"], + "Import": ["names"], + "alias": ["name", "asname"], + "ImportFrom": ["module", "names"], + "Delete": ["targets"], + "Global": ["names"], + "Nonlocal": ["names"], + "withitem": ["context_expr", "optional_vars"], + "Subscript": ["value", "slice"], + "Slice": ["lower", "upper", "step"], + "ExtSlice": ["dims"], + "Index": ["value"], + "Starred": ["value"], + "Yield": ["value"], + "ExceptHandler": ["type"], + "Call": ["func", "args", "keywords"], + "Compare": 
["left", "ops", "comparators"], + "BoolOp": ["values", "op"], + "Assert": ["test", "msg"], + "List": ["elts"], + "Tuple": ["elts"], + "Set": ["elts"], + "UnaryOp": ["operand", "op"], + "BinOp": ["left", "right", "op"], + "Await": ["value"], + "GeneratorExp": ["elt", "generators"], + "ListComp": ["elt", "generators"], + "SetComp": ["elt", "generators"], + "DictComp": ["key", "value", "generators"], + "Return": ["value"], + "Raise": ["exc", "cause"], + "YieldFrom": ["value"], + } + + overriden_node_type_edges = { + "Module": [], # overridden + "FunctionDef": ["function_name", "args", "decorator_list", "returned_by"], # overridden, `function_name` replaces `name`, `returned_by` replaces `returns` + "AsyncFunctionDef": ["function_name", "args", "decorator_list", "returned_by"], # overridden, `function_name` replaces `name`, `returned_by` replaces `returns` + "ClassDef": ["class_name"], # overridden, `class_name` replaces `name` + "AnnAssign": ["target", "value", "annotation_for"], # overridden, `annotation_for` replaces `annotation` + "With": ["items"], # overridden + "AsyncWith": ["items"], # overridden + "arg": ["arg", "annotation_for"], # overridden, `annotation_for` is custom + "Lambda": [], # overridden + "IfExp": ["test", "if_true", "if_false"], # overridden, `if_true` renamed from `body`, `if_false` renamed from `orelse` + "keyword": ["arg", "value"], # overridden + "Attribute": ["value", "attr"], # overridden + "Num": [], # overridden + "Str": [], # overridden + "Bytes": [], # overridden + "If": ["test"], # overridden + "For": ["target", "iter"], # overridden + "AsyncFor": ["target", "iter"], # overridden + "Try": [], # overridden + "While": [], # overridden + "Expr": ["value"], # overridden + "Dict": ["keys", "values"], # overridden + "JoinedStr": [], # overridden + "FormattedValue": ["value"], # overridden + "arguments": ["args", "vararg", "kwarg", "kwonlyargs", "posonlyargs"], # overridden + "comprehension": ["target", "iter", "ifs"], # overridden, 
`target_for` is custom, `iter_for` is customm `ifs_rev` is custom + } + + context_edge_names = { + "Module": ["defined_in_module"], + "FunctionDef": ["defined_in_function"], + "ClassDef": ["defined_in_class"], + "With": ["executed_inside_with"], + "AsyncWith": ["executed_inside_with"], + "If": ["executed_if_true", "executed_if_false"], + "For": ["executed_in_for", "executed_in_for_orelse"], + "AsyncFor": ["executed_in_for", "executed_in_for_orelse"], + "While": ["executed_in_while", "executed_while_true"], + "Try": ["executed_in_try", "executed_in_try_final", "executed_in_try_else", "executed_in_try_except", "executed_with_try_handler"], + } + + extra_edge_types = { + "control_flow", "next", "local_mention", + } + + # exceptions needed when we do not want to filter some edge types using a simple rule `_rev` + reverse_edge_exceptions = { + # "target": "target_for", + # "iter": "iter_for", # mainly used in comprehension + # "ifs": "ifs_for", # mainly used in comprehension + "next": "prev", + "local_mention": None, # from name to variable mention + "returned_by": None, # for type annotations + "annotation_for": None, # for type annotations + "control_flow": None, # for control flow + "op": None, # for operations + "attr": None, # for attributes + # "arg": None # for keywords ??? 
+ } + + iterable_nodes = { # parse_iterable + "List", "Tuple", "Set" + } + + named_nodes = { + "Name", "NameConstant" # parse_name + } + + constant_nodes = { + "Constant" # parse_Constant + } + + operand_nodes = { # parse_op_name + "And", "Or", "Not", "Is", "Gt", "Lt", "GtE", "LtE", "Eq", "NotEq", "Ellipsis", "Add", "Mod", + "Sub", "UAdd", "USub", "Div", "Mult", "MatMult", "Pow", "FloorDiv", "RShift", "LShift", "BitAnd", + "BitOr", "BitXor", "IsNot", "NotIn", "In", "Invert" + } + + control_flow_nodes = { # parse_control_flow + "Continue", "Break", "Pass" + } + + # extra node types exist for keywords and attributes to prevent them from + # getting mixed with local variable mentions + extra_node_types = { + "#keyword#", + "#attr#" + } + + @classmethod + def regular_node_types(cls): + return set(cls.ast_node_type_edges.keys()) + + @classmethod + def overridden_node_types(cls): + return set(cls.overriden_node_type_edges.keys()) + + @classmethod + def node_types(cls): + return list( + cls.regular_node_types() | + cls.overridden_node_types() | + cls.iterable_nodes | cls.named_nodes | cls.constant_nodes | + cls.operand_nodes | cls.control_flow_nodes | cls.extra_node_types + ) + + @classmethod + def scope_edges(cls): + return set(map(lambda x: x, chain(*cls.context_edge_names.values()))) # "defined_in_" + + + @classmethod + def auxiliary_edges(cls): + direct_edges = cls.scope_edges() | cls.extra_edge_types + reverse_edges = cls.compute_reverse_edges(direct_edges) + return direct_edges | reverse_edges + + @classmethod + def compute_reverse_edges(cls, direct_edges): + reverse_edges = set() + for edge in direct_edges: + if edge in cls.reverse_edge_exceptions: + reverse = cls.reverse_edge_exceptions[edge] + if reverse is not None: + reverse_edges.add(reverse) + else: + reverse_edges.add(edge + "_rev") + return reverse_edges + + @classmethod + def edge_types(cls): + direct_edges = list( + set(chain(*cls.ast_node_type_edges.values())) | + 
set(chain(*cls.overriden_node_type_edges.values())) | + cls.scope_edges() | + cls.extra_edge_types | cls.named_nodes | cls.constant_nodes | + cls.operand_nodes | cls.control_flow_nodes | cls.extra_node_types + ) + + reverse_edges = list(cls.compute_reverse_edges(direct_edges)) + return direct_edges + reverse_edges + + +PythonAstNodeTypes = Enum( + "PythonAstNodeTypes", + " ".join( + PythonNodeEdgeDefinitions.node_types() + ) +) + + +PythonAstEdgeTypes = Enum( + "PythonAstEdgeTypes", + " ".join( + PythonNodeEdgeDefinitions.edge_types() + ) +) + + +class PythonCodeExamplesForNodes: + examples = { + "FunctionDef": + "def f(a):\n" + " return a\n", + "ClassDef": + "class C:\n" + " def m():\n" + " pass\n", + "AnnAssign": "a: int = 5\n", + "With": + "with open(a) as b:\n" + " do_stuff(b)\n", + "arg": + "def f(a: int = 5):\n" + " return a\n", + "Lambda": "lambda x: x + 3\n", + "IfExp": "a = 5 if True else 0\n", + "keyword": "fn(a=5, b=4)\n", + "Attribute": "a.b.c\n", + "If": + "if d is True:\n" + " a = b\n" + "else:\n" + " a = c\n", + "For": + "for i in list:\n" + " k = fn(i)\n" + " if k == 4:\n" + " fn2(k)\n" + " break\n" + "else:\n" + " fn2(0)\n", + "Try": + "try:\n" + " a = b\n" + "except Exception as e:\n" + " a = c\n" + "else:\n" + " a = d\n" + "finally:\n" + " print(a)\n", + "While": + "while b = c:\n" + " do_iter(b)\n", + "Dict": "{a:b, c:d}\n", + "comprehension": "[i for i in list if i != 5]\n", + "BinOp": "c = a + b\n", + "ImportFrom": "from module import Class\n", + "alias": "import module as m\n", + "List": "a = [1, 2, 3, 4]\n" + } + + +def generate_available_edges(): + node_types = PythonNodeEdgeDefinitions.node_types() + for nt in sorted(node_types): + if hasattr(ast, nt): + fl = sorted(getattr(ast, nt)._fields) + if len(fl) == 0: + print(nt, ) + else: + for f in fl: + print(nt, f, sep=" ") + + +def generate_utilized_edges(): + d = dict() + d.update(PythonNodeEdgeDefinitions.ast_node_type_edges) + d.update(PythonNodeEdgeDefinitions.overriden_node_type_edges) + 
for nt in sorted(d.keys()): + if hasattr(ast, nt): + fl = sorted(d[nt]) + if len(fl) == 0: + print(nt, ) + else: + for f in fl: + print(nt, f, sep=" ") + + +class PythonSharedNodes: + annotation_types = {"type_annotation", "returned_by"} + tokenizable_types = {"Name", "#attr#", "#keyword#"} + python_token_types = {"Op", "Constant", "JoinedStr", "CtlFlow", "ast_Literal"} + subword_types = {'subword'} + + subword_leaf_types = annotation_types | subword_types | python_token_types + named_leaf_types = annotation_types | tokenizable_types | python_token_types + tokenizable_types_and_annotations = annotation_types | tokenizable_types + + shared_node_types = annotation_types | subword_types | tokenizable_types | python_token_types + + # leaf_types = {'subword', "Op", "Constant", "JoinedStr", "CtlFlow", "ast_Literal", "Name", "type_annotation", "returned_by"} + # shared_node_types = {'subword', "Op", "Constant", "JoinedStr", "CtlFlow", "ast_Literal", "Name", "type_annotation", "returned_by", "#attr#", "#keyword#"} + + @classmethod + def is_shared(cls, node): + # nodes that are of stared type + # nodes that are subwords of keyword arguments + return cls.is_shared_name_type(node.name, node.type) + + @classmethod + def is_shared_name_type(cls, name, type): + if type in cls.shared_node_types or \ + (type == "subword_instance" and "0x" not in name): + return True + return False + + +class GNode: + def __init__(self, **kwargs): + self.string = None + for k, v in kwargs.items(): + setattr(self, k, v) + + def __eq__(self, other): + return self.name == other.name and self.type == other.type + + def __repr__(self): + return self.__dict__.__repr__() + + def __hash__(self): + return (self.name, self.type).__hash__() + + def setprop(self, key, value): + setattr(self, key, value) + + +class GEdge: + def __init__(self, src, dst, type, scope=None, line=None, end_line=None, col_offset=None, end_col_offset=None): + self.src = src + self.dst = dst + self.type = type + self.line = line + 
self.scope = scope + self.end_line = end_line + self.col_offset = col_offset + self.end_col_offset = end_col_offset + + def __getitem__(self, item): + return self.__dict__[item] + + +class AstGraphGenerator(object): + + def __init__(self, source, add_reverse_edges=True): + self.source = source.split("\n") # lines of the source code + self.root = ast.parse(source) + self.current_condition = [] + self.condition_status = [] + self.scope = [] + self._add_reverse_edges = add_reverse_edges + + self._identifier_pool = IdentifierPool() + + def get_source_from_ast_range(self, node, strip=True): + start_line = node.lineno + end_line = node.end_lineno + start_col = node.col_offset + end_col = node.end_col_offset + + source = "" + num_lines = end_line - start_line + 1 + if start_line == end_line: + section = self.source[start_line - 1].encode("utf8")[start_col:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" + else: + for ind, lineno in enumerate(range(start_line - 1, end_line)): + if ind == 0: + section = self.source[lineno].encode("utf8")[start_col:].decode( + "utf8") + source += section.strip() if strip else section + "\n" + elif ind == num_lines - 1: + section = self.source[lineno].encode("utf8")[:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" + else: + section = self.source[lineno] + source += section.strip() if strip else section + "\n" + + return source.rstrip() + + def get_name(self, *, node=None, name=None, type=None, add_random_identifier=False): + + random_identifier = self._identifier_pool.get_new_identifier() + + if node is not None: + name = f"{node.__class__.__name__}_{random_identifier}" + type = node.__class__.__name__ + else: + if add_random_identifier: + name = f"{name}_{random_identifier}" + + if hasattr(node, "lineno"): + node_string = self.get_source_from_ast_range(node, strip=False) + # if "\n" in node_string: + # node_string = None + else: + node_string = None + + if len(self.scope) 
> 0: + return GNode(name=name, type=type, scope=copy(self.scope[-1]), string=node_string) + else: + return GNode(name=name, type=type, string=node_string) + # return (node.__class__.__name__ + "_" + str(hex(int(time_ns()))), node.__class__.__name__) + + def get_edges(self, as_dataframe=True): + edges = [] + for f_def_node in ast.iter_child_nodes(self.root): + if type(f_def_node) == ast.FunctionDef: + edges.extend(self.parse(f_def_node)) + break # to avoid going through nested definitions + + if not as_dataframe: + return edges + df = pd.DataFrame(edges) + return df.astype({col: "Int32" for col in df.columns if col not in {"src", "dst", "type"}}) + + def parse(self, node): + n_type = type(node).__name__ + if n_type in PythonNodeEdgeDefinitions.ast_node_type_edges: + return self.generic_parse( + node, + PythonNodeEdgeDefinitions.ast_node_type_edges[n_type] + ) + elif n_type in PythonNodeEdgeDefinitions.overriden_node_type_edges: + method_name = "parse_" + n_type + return self.__getattribute__(method_name)(node) + elif n_type in PythonNodeEdgeDefinitions.iterable_nodes: + return self.parse_iterable(node) + elif n_type in PythonNodeEdgeDefinitions.named_nodes: + return self.parse_name(node) + elif n_type in PythonNodeEdgeDefinitions.constant_nodes: + return self.parse_Constant(node) + elif n_type in PythonNodeEdgeDefinitions.operand_nodes: + return self.parse_op_name(node) + elif n_type in PythonNodeEdgeDefinitions.control_flow_nodes: + return self.parse_control_flow(node) + else: + print(type(node)) + print(ast.dump(node)) + print(node._fields) + pprint(self.source) + return self.generic_parse(node, node._fields) + # raise Exception() + # return [type(node)] + + def add_edge( + self, edges, src, dst, type, scope=None, + position_node=None, var_position_node=None + ): + edges.append({ + "src": src, "dst": dst, "type": type, "scope": scope, + }) + + def get_positions(node): + if node is not None and hasattr(node, "lineno"): + line = node.lineno-1 + end_line = 
node.end_lineno - 1 + col_offset = node.col_offset + end_col_offset = node.end_col_offset + else: + line = end_line = col_offset = end_col_offset = None + return line, end_line, col_offset, end_col_offset + + line, end_line, col_offset, end_col_offset = get_positions(position_node) + + if line is not None: + edges[-1].update({ + "line": line, "end_line": end_line, "col_offset": col_offset, "end_col_offset": end_col_offset + }) + + var_line, var_end_line, var_col_offset, var_end_col_offset = get_positions(var_position_node) + + if var_line is not None: + edges[-1].update({ + "var_line": var_line, "var_end_line": var_end_line, "var_col_offset": var_col_offset, "var_end_col_offset": var_end_col_offset + }) + + reverse_type = PythonNodeEdgeDefinitions.reverse_edge_exceptions.get(type, type + "_rev") + if self._add_reverse_edges is True and reverse_type is not None: + edges.append({ + "src": dst, "dst": src, "type": reverse_type, "scope": scope + }) + + def parse_body(self, nodes): + edges = [] + last_node = None + for node in nodes: + s = self.parse(node) + if isinstance(s, tuple): + if s[1].type == "Constant": # this happens when processing docstring, as a result a lot of nodes are connected to the node Constant_ + continue # in general, constant node has no affect as a body expression, can skip + # some parsers return edges and names? 
+ edges.extend(s[0]) + + if last_node is not None: + self.add_edge(edges, src=last_node, dst=s[1], type="next", scope=self.scope[-1]) + + last_node = s[1] + + for cond_name, cond_stat in zip(self.current_condition[-1:], self.condition_status[-1:]): + self.add_edge(edges, src=last_node, dst=cond_name, type=cond_stat, scope=self.scope[-1]) # "defined_in_" + + else: + edges.extend(s) + return edges + + def parse_in_context(self, cond_name, cond_stat, edges, body): + if isinstance(cond_name, str): + cond_name = [cond_name] + cond_stat = [cond_stat] + elif isinstance(cond_name, GNode): + cond_name = [cond_name] + cond_stat = [cond_stat] + + for cn, cs in zip(cond_name, cond_stat): + self.current_condition.append(cn) + self.condition_status.append(cs) + + edges.extend(self.parse_body(body)) + + for i in range(len(cond_name)): + self.current_condition.pop(-1) + self.condition_status.pop(-1) + + def parse_as_mention(self, name): + mention_name = GNode(name=name + "@" + self.scope[-1].name, type="mention", scope=copy(self.scope[-1])) + name = GNode(name=name, type="Name") + # mention_name = (name + "@" + self.scope[-1], "mention") + + # edge from name to mention in a function + edges = [] + self.add_edge(edges, src=name, dst=mention_name, type="local_mention", scope=self.scope[-1]) + return edges, mention_name + + def parse_operand(self, node): + # need to make sure upper level name is correct; handle @keyword; type placeholder for sourcetrail nodes? + edges = [] + if isinstance(node, str): + # fall here when parsing attributes, they are given as strings; should attributes be parsed into subwords? 
+ if "@" in node: + parts = node.split("@") + node = GNode(name=parts[0], type=parts[1]) + else: + # node = GNode(name=node, type="Name") + node = ast.Name(node) + edges_, node = self.parse(node) + edges.extend(edges_) + iter_ = node + elif isinstance(node, int) or node is None: + iter_ = GNode(name=str(node), type="ast_Literal") + # iter_ = str(node) + elif isinstance(node, GNode): + iter_ = node + else: + iter_e = self.parse(node) + if type(iter_e) == str: + iter_ = iter_e + elif isinstance(iter_e, GNode): + iter_ = iter_e + elif type(iter_e) == tuple: + ext_edges, name = iter_e + assert isinstance(name, GNode) + edges.extend(ext_edges) + iter_ = name + else: + # unexpected scenario + print(node) + print(ast.dump(node)) + print(iter_e) + print(type(iter_e)) + pprint(self.source) + print(self.source[node.lineno - 1].strip()) + raise Exception() + + return iter_, edges + + def parse_and_add_operand(self, node_name, operand, type, edges): + + operand_name, ext_edges = self.parse_operand(operand) + edges.extend(ext_edges) + + self.add_edge( + edges, src=operand_name, dst=node_name, type=type, scope=self.scope[-1], + position_node=operand + ) + + def generic_parse(self, node, operands, with_name=None, ensure_iterables=False): + + edges = [] + + if with_name is None: + node_name = self.get_name(node=node) + else: + node_name = with_name + + for operand in operands: + if operand in ["body", "orelse", "finalbody"]: + logging.warning(f"Not clear which node is processed here {ast.dump(node)}") + self.parse_in_context(node_name, operand, edges, node.__getattribute__(operand)) + else: + operand_ = node.__getattribute__(operand) + if operand_ is not None: + if isinstance(operand_, Iterable) and not isinstance(operand_, str): + # TODO: + # appears as leaf node if the iterable is empty. 
suggest adding an "EMPTY" element + for oper_ in operand_: + self.parse_and_add_operand(node_name, oper_, operand, edges) + else: + self.parse_and_add_operand(node_name, operand_, operand, edges) + + # TODO + # need to identify the benefit of this node + # maybe it is better to use node types in the graph + # edges.append({"scope": copy(self.scope[-1]), "src": node.__class__.__name__, "dst": node_name, "type": "node_type"}) + + return edges, node_name + + def parse_type_node(self, node): + if node.lineno == node.end_lineno: + type_str = self.source[node.lineno][node.col_offset - 1: node.end_col_offset] + else: + type_str = "" + for ln in range(node.lineno - 1, node.end_lineno): + if ln == node.lineno - 1: + type_str += self.source[ln][node.col_offset - 1:].strip() + elif ln == node.end_lineno - 1: + type_str += self.source[ln][:node.end_col_offset].strip() + else: + type_str += self.source[ln].strip() + return type_str + + def parse_Module(self, node): + edges, module_name = self.generic_parse(node, []) + self.scope.append(module_name) + self.parse_in_context(module_name, "defined_in_module", edges, node.body) + self.scope.pop(-1) + return edges, module_name + + def parse_FunctionDef(self, node): + # need to create function name before generic_parse so that the scope is set up correctly + # scope is used to create local mentions of variable and function names + fdef_node_name = self.get_name(node=node) + self.scope.append(fdef_node_name) + + to_parse = [] + if len(node.args.args) > 0 or node.args.vararg is not None: + to_parse.append("args") + if len("decorator_list") > 0: + to_parse.append("decorator_list") + + edges, f_name = self.generic_parse(node, to_parse, with_name=fdef_node_name) + + if node.returns is not None: + # returns stores return type annotation + # can contain quotes + # https://stackoverflow.com/questions/46458470/should-you-put-quotes-around-type-annotations-in-python + # https://www.python.org/dev/peps/pep-0484/#forward-references + 
annotation_string = self.get_source_from_ast_range(node.returns) + annotation = GNode(name=annotation_string, + type="type_annotation") + self.add_edge( + edges, src=annotation, dst=f_name, type="returned_by", scope=self.scope[-1], + position_node=node.returns + ) + + self.parse_in_context(f_name, "defined_in_function", edges, node.body) + + self.scope.pop(-1) + + ext_edges, func_name = self.parse_as_mention(name=node.name) + edges.extend(ext_edges) + + assert isinstance(node.name, str) + self.add_edge( + edges, src=f_name, dst=func_name, type="function_name", scope=self.scope[-1], + ) + + return edges, f_name + + def parse_AsyncFunctionDef(self, node): + return self.parse_FunctionDef(node) + + def parse_ClassDef(self, node): + + edges, class_node_name = self.generic_parse(node, []) + + self.scope.append(class_node_name) + self.parse_in_context(class_node_name, "defined_in_class", edges, node.body) + self.scope.pop(-1) + + ext_edges, cls_name = self.parse_as_mention(name=node.name) + edges.extend(ext_edges) + self.add_edge( + edges, src=class_node_name, dst=cls_name, type="class_name", scope=self.scope[-1], + ) + + return edges, class_node_name + + def parse_With(self, node): + edges, with_name = self.generic_parse(node, ["items"]) + + self.parse_in_context(with_name, "executed_inside_with", edges, node.body) + + return edges, with_name + + def parse_AsyncWith(self, node): + return self.parse_With(node) + + def parse_arg(self, node): + # node.annotation stores type annotation + # if node.annotation: + # print(self.source[node.lineno-1]) # can get definition string here + # print(node.arg) + + # # included mention + name = self.get_name(node=node) + edges, mention_name = self.parse_as_mention(node.arg) + self.add_edge( + edges, src=mention_name, dst=name, type="arg", scope=self.scope[-1], + ) + + if node.annotation is not None: + # can contain quotes + # https://stackoverflow.com/questions/46458470/should-you-put-quotes-around-type-annotations-in-python + # 
https://www.python.org/dev/peps/pep-0484/#forward-references + annotation_string = self.get_source_from_ast_range(node.annotation) + annotation = GNode(name=annotation_string, + type="type_annotation") + mention_name = GNode(name=node.arg + "@" + self.scope[-1].name, type="mention", scope=copy(self.scope[-1])) + self.add_edge( + edges, src=annotation, dst=mention_name, type="annotation_for", scope=self.scope[-1], + position_node=node.annotation, var_position_node=node + ) + return edges, name + + def parse_AnnAssign(self, node): + # stores annotation information for variables + # + # paths: List[Path] = [] + # AnnAssign(target=Name(id='paths', ctx=Store()), annotation=Subscript(value=Name(id='List', ctx=Load()), + # slice=Index(value=Name(id='Path', ctx=Load())), + # ctx=Load()), value=List(elts=[], ctx=Load()), simple=1) + + # TODO + # parse value?? + + # can contain quotes + # https://stackoverflow.com/questions/46458470/should-you-put-quotes-around-type-annotations-in-python + # https://www.python.org/dev/peps/pep-0484/#forward-references + annotation_string = self.get_source_from_ast_range(node.annotation) + annotation = GNode(name=annotation_string, + type="type_annotation") + edges, name = self.generic_parse(node, ["target", "value"]) + try: + mention_name = GNode(name=node.target.id + "@" + self.scope[-1].name, type="mention", scope=copy(self.scope[-1])) + except Exception as e: + mention_name = name + + self.add_edge( + edges, src=annotation, dst=mention_name, type="annotation_for", scope=self.scope[-1], + position_node=node.annotation, var_position_node=node + ) + return edges, name + + def parse_Lambda(self, node): + # this is too ambiguous + edges, lmb_name = self.generic_parse(node, []) + self.parse_and_add_operand(lmb_name, node.body, "lambda", edges) + + return edges, lmb_name + + def parse_IfExp(self, node): + edges, ifexp_name = self.generic_parse(node, ["test"]) + self.parse_and_add_operand(ifexp_name, node.body, "if_true", edges) + 
self.parse_and_add_operand(ifexp_name, node.orelse, "if_false", edges) + return edges, ifexp_name + + def parse_ExceptHandler(self, node): + # have missing fields. example: + # not parsing "name" field + # except handler is unique for every function + return self.generic_parse(node, ["type"]) + + def parse_keyword(self, node): + if isinstance(node.arg, str): + # change arg name so that it does not mix with variable names + node.arg += "@#keyword#" + return self.generic_parse(node, ["arg", "value"]) + else: + return self.generic_parse(node, ["value"]) + + def parse_name(self, node): + edges = [] + # if type(node) == ast.Attribute: + # left, ext_edges = self.parse(node.value) + # right = node.attr + # return self.parse(node.value) + "___" + node.attr + if type(node) == ast.Name: + return self.parse_as_mention(str(node.id)) + elif type(node) == ast.NameConstant: + return GNode(name=str(node.value), type="NameConstant") + + def parse_Attribute(self, node): + if node.attr is not None: + # change attr name so that it does not mix with variable names + node.attr += "@#attr#" + return self.generic_parse(node, ["value", "attr"]) + + def parse_Constant(self, node): + # TODO + # decide whether this name should be unique or not + name = GNode(name="Constant_", type="Constant") + # name = "Constant_" + # if node.kind is not None: + # name += "" + return name + + def parse_op_name(self, node): + return GNode(name=node.__class__.__name__, type="Op") + # return node.__class__.__name__ + + def parse_Num(self, node): + return str(node.n) + + def parse_Str(self, node): + return self.generic_parse(node, []) + # return node.s + + def parse_Bytes(self, node): + return repr(node.s) + + def parse_If(self, node): + + edges, if_name = self.generic_parse(node, ["test"]) + + self.parse_in_context(if_name, "executed_if_true", edges, node.body) + self.parse_in_context(if_name, "executed_if_false", edges, node.orelse) + + return edges, if_name + + def parse_For(self, node): + + edges, for_name = 
self.generic_parse(node, ["target", "iter"]) + + self.parse_in_context(for_name, "executed_in_for", edges, node.body) + self.parse_in_context(for_name, "executed_in_for_orelse", edges, node.orelse) + + return edges, for_name + + def parse_AsyncFor(self, node): + return self.parse_For(node) + + def parse_Try(self, node): + + edges, try_name = self.generic_parse(node, []) + + self.parse_in_context(try_name, "executed_in_try", edges, node.body) + + for h in node.handlers: + + handler_name, ext_edges = self.parse_operand(h) + edges.extend(ext_edges) + self.parse_in_context([try_name, handler_name], ["executed_in_try_except", "executed_with_try_handler"], edges, h.body) + + self.parse_in_context(try_name, "executed_in_try_final", edges, node.finalbody) + self.parse_in_context(try_name, "executed_in_try_else", edges, node.orelse) + + return edges, try_name + + def parse_While(self, node): + + edges, while_name = self.generic_parse(node, []) + + cond_name, ext_edges = self.parse_operand(node.test) + edges.extend(ext_edges) + + self.parse_in_context([while_name, cond_name], ["executed_in_while", "executed_while_true"], edges, node.body) + + return edges, while_name + + # def parse_Compare(self, node): + # return self.generic_parse(node, ["left", "ops", "comparators"]) + # + # def parse_BoolOp(self, node): + # return self.generic_parse(node, ["values", "op"]) + + def parse_Expr(self, node): + edges = [] + expr_name, ext_edges = self.parse_operand(node.value) + edges.extend(ext_edges) + + return edges, expr_name + + def parse_control_flow(self, node): + edges = [] + ctrlflow_name = self.get_name(name="ctrl_flow", type="CtlFlowInstance", add_random_identifier=True) + self.add_edge( + edges, + src=GNode(name=node.__class__.__name__, type="CtlFlow"), dst=ctrlflow_name, + type="control_flow", scope=self.scope[-1] + ) + + return edges, ctrlflow_name + + def parse_iterable(self, node): + return self.generic_parse(node, ["elts"], ensure_iterables=True) + + def parse_Dict(self, 
node): + return self.generic_parse(node, ["keys", "values"], ensure_iterables=True) + + def parse_JoinedStr(self, node): + joinedstr_name = GNode(name="JoinedStr_", type="JoinedStr") + return [], joinedstr_name + # return self.generic_parse(node, []) + # return self.generic_parse(node, ["values"]) + + def parse_FormattedValue(self, node): + # have missing fields. example: + # FormattedValue(value=Subscript(value=Name(id='args', ctx=Load()), slice=Index(value=Num(n=0)), ctx=Load()),conversion=-1, format_spec=None) + # 'conversion', 'format_spec' not parsed + return self.generic_parse(node, ["value"]) + + def parse_arguments(self, node): + # have missing fields. example: + # arguments(args=[arg(arg='self', annotation=None), arg(arg='tqdm_cls', annotation=None), arg(arg='sleep_interval', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]) + + # vararg constains type annotations + return self.generic_parse(node, ["args", "vararg", "kwarg", "kwonlyargs", "posonlyargs"]) + + def parse_comprehension(self, node): + edges = [] + + cph_name = self.get_name(name="comprehension", type="comprehension", add_random_identifier=True) + + target, ext_edges = self.parse_operand(node.target) + edges.extend(ext_edges) + + self.add_edge( + edges, src=target, dst=cph_name, type="target", scope=self.scope[-1], + position_node=node.target + ) + + iter_, ext_edges = self.parse_operand(node.iter) + edges.extend(ext_edges) + + self.add_edge( + edges, src=iter_, dst=cph_name, type="iter", scope=self.scope[-1], + position_node=node.iter + ) + + for if_ in node.ifs: + if_n, ext_edges = self.parse_operand(if_) + edges.extend(ext_edges) + self.add_edge( + edges, src=if_n, dst=cph_name, type="ifs", scope=self.scope[-1], + ) + + return edges, cph_name + + # def parse_alias(self, node): + # if isinstance(node.name, str): + # node.name = ast.Name(node.name) + # if isinstance(node.asname, str): + # node.asname = ast.Name(node.asname) + # return 
self.generic_parse(node, ["name", "asname"]) + # + # def parse_ImportFrom(self, node): + # if node.module is not None: + # node.module = ast.Name(node.module) + # return self.generic_parse(node, ["module", "names"]) + +if __name__ == "__main__": + c = "def f(a=5): f(a=4)" + g = AstGraphGenerator(c.lstrip()) + g.parse(g.root) + # import sys + # f_bodies = pd.read_csv(sys.argv[1]) + # failed = 0 + # for ind, c in enumerate(f_bodies['body_normalized']): + # if isinstance(c, str): + # try: + # g = AstGraphGenerator(c.lstrip()) + # except SyntaxError as e: + # print(e) + # continue + # failed += 1 + # edges = g.get_edges() + # # edges.to_csv(os.path.join(os.path.dirname(sys.argv[1]), "body_edges.csv"), mode="a", index=False, header=(ind==0)) + # print("\r%d/%d" % (ind, len(f_bodies['normalized_body'])), end="") + # else: + # print("Skipped not a string") + # + # print(" " * 30, end="\r") + # print(failed, len(f_bodies['normalized_body'])) diff --git a/SourceCodeTools/code/ast/python_ast_cf.py b/SourceCodeTools/code/ast/python_ast_cf.py new file mode 100644 index 00000000..64e63ba4 --- /dev/null +++ b/SourceCodeTools/code/ast/python_ast_cf.py @@ -0,0 +1,509 @@ +import ast +from copy import copy +from pprint import pprint +from time import time_ns +from collections.abc import Iterable +import pandas as pd + +from SourceCodeTools.code.annotator_utils import to_offsets + + +class GNode: + # name = None + # type = None + # id = None + + def __init__(self, **kwargs): + self.string = None + for k, v in kwargs.items(): + setattr(self, k, v) + + def __eq__(self, other): + if self.name == other.name and self.type == other.type: + return True + else: + return False + + def __repr__(self): + return self.__dict__.__repr__() + + def __hash__(self): + return (self.name, self.type).__hash__() + + def setprop(self, key, value): + setattr(self, key, value) + + +class AstGraphGenerator(object): + + def __init__(self, source): + self.source = source.split("\n") # lines of the source code + 
self.full_source = source + self.root = ast.parse(source) + self.current_condition = [] + self.condition_status = [] + self.scope = [] + + def get_source_from_ast_range(self, start_line, end_line, start_col, end_col): + source = "" + num_lines = end_line - start_line + 1 + if start_line == end_line: + source += self.source[start_line - 1].encode("utf8")[start_col:end_col].decode( + "utf8").strip() + else: + for ind, lineno in enumerate(range(start_line - 1, end_line)): + if ind == 0: + source += self.source[lineno].encode("utf8")[start_col:].decode( + "utf8").strip() + elif ind == num_lines - 1: + source += self.source[lineno].encode("utf8")[:end_col].decode( + "utf8").strip() + else: + source += self.source[lineno].strip() + + return source + + def get_name(self, *, node=None, name=None, type=None, add_random_identifier=False): + + if node is not None: + name = node.__class__.__name__ + "_" + str(hex(int(time_ns()))) + type = node.__class__.__name__ + else: + if add_random_identifier: + name += f"_{str(hex(int(time_ns())))}" + + if len(self.scope) > 0: + return GNode(name=name, type=type, scope=copy(self.scope[-1])) + else: + return GNode(name=name, type=type) + # return (node.__class__.__name__ + "_" + str(hex(int(time_ns()))), node.__class__.__name__) + + def get_edges(self, as_dataframe=False): + edges = [] + edges.extend(self.parse(self.root)[0]) + # for f_def_node in ast.iter_child_nodes(self.root): + # if type(f_def_node) == ast.FunctionDef: + # edges.extend(self.parse(f_def_node)) + # break # to avoid going through nested definitions + + if not as_dataframe: + return edges + df = pd.DataFrame(edges) + return df.astype({col: "Int32" for col in df.columns if col not in {"src", "dst", "type"}}) + + def parse(self, node): + n_type = type(node) + method_name = "parse_" + n_type.__name__ + if hasattr(self, method_name): + return self.__getattribute__(method_name)(node) + else: + # print(type(node)) + # print(ast.dump(node)) + # print(node._fields) + # 
pprint(self.source) + return self.parse_as_expression(node) + # return self.generic_parse(node, node._fields) + # raise Exception() + # return [type(node)] + + def parse_body(self, nodes): + edges = [] + last_node = None + for node in nodes: + s = self.parse(node) + if isinstance(s, tuple): + # some parsers return edges and names? + edges.extend(s[0]) + + if last_node is not None: + edges.append({"dst": s[1], "src": last_node, "type": "next"}) + edges.append({"dst": last_node, "src": s[1], "type": "prev"}) + + last_node = s[1] + + for cond_name, cond_stat in zip(self.current_condition[-1:], self.condition_status[-1:]): + edges.append({"scope": copy(self.scope[-1]), "src": last_node, "dst": cond_name, "type": "defined_in_" + cond_stat}) + edges.append({"scope": copy(self.scope[-1]), "src": cond_name, "dst": last_node, "type": "defined_in_" + cond_stat + "_rev"}) + # edges.append({"scope": copy(self.scope[-1]), "src": cond_name, "dst": last_node, "type": "execute_when_" + cond_stat}) + else: + edges.extend(s) + return edges + + def parse_in_context(self, cond_name, cond_stat, edges, body): + if isinstance(cond_name, str): + cond_name = [cond_name] + cond_stat = [cond_stat] + elif isinstance(cond_name, GNode): + cond_name = [cond_name] + cond_stat = [cond_stat] + + for cn, cs in zip(cond_name, cond_stat): + self.current_condition.append(cn) + self.condition_status.append(cs) + + edges.extend(self.parse_body(body)) + + for i in range(len(cond_name)): + self.current_condition.pop(-1) + self.condition_status.pop(-1) + + def parse_as_expression(self, node, *args, **kwargs): + offset = to_offsets(self.full_source, + [(node.lineno - 1, node.end_lineno - 1, node.col_offset, node.end_col_offset, "expression")], + as_bytes=True) + offset, = offset + line = self.full_source[offset[0]: offset[1]].replace("@","##at##") + name = GNode(name=line, type="Name") + # expr = GNode(name="Expression" + "_" + str(hex(int(time_ns()))), type="mention") + expr = 
GNode(name=f"{line}@{self.scope[-1].name}", type="mention") + edges = [ + {"scope": copy(self.scope[-1]), "src": name, "dst": expr, "type": "local_mention", "line": node.lineno - 1, "end_line": node.end_lineno - 1, "col_offset": node.col_offset, "end_col_offset": node.end_col_offset}, + ] + + return edges, expr + + def parse_as_mention(self, name): + mention_name = GNode(name=name + "@" + self.scope[-1].name, type="mention", scope=copy(self.scope[-1])) + name = GNode(name=name, type="Name") + # mention_name = (name + "@" + self.scope[-1], "mention") + + # edge from name to mention in a function + edges = [ + {"scope": copy(self.scope[-1]), "src": name, "dst": mention_name, "type": "local_mention"}, + # {"scope": copy(self.scope[-1]), "src": self.scope[-1], "dst": mention_name, "type": "mention_scope"} + ] + return edges, mention_name + + def parse_operand(self, node): + # need to make sure upper level name is correct; handle @keyword; type placeholder for sourcetrail nodes? + edges = [] + if isinstance(node, str): + # fall here when parsing attributes, they are given as strings; should attributes be parsed into subwords? 
+ if "@" in node: + parts = node.split("@") + node = GNode(name=parts[0], type=parts[1]) + else: + node = GNode(name=node, type="Name") + iter_ = node + elif isinstance(node, int) or node is None: + iter_ = GNode(name=str(node), type="ast_Literal") + # iter_ = str(node) + elif isinstance(node, GNode): + iter_ = node + else: + iter_e = self.parse(node) + if type(iter_e) == str: + iter_ = iter_e + elif isinstance(iter_e, GNode): + iter_ = iter_e + elif type(iter_e) == tuple: + ext_edges, name = iter_e + assert isinstance(name, GNode) + edges.extend(ext_edges) + iter_ = name + else: + # unexpected scenario + print(node) + print(ast.dump(node)) + print(iter_e) + print(type(iter_e)) + pprint(self.source) + print(self.source[node.lineno - 1].strip()) + raise Exception() + + return iter_, edges + + def parse_and_add_operand(self, node_name, operand, type, edges): + + operand_name, ext_edges = self.parse_operand(operand) + edges.extend(ext_edges) + + if hasattr(operand, "lineno"): + edges.append({"scope": copy(self.scope[-1]), "src": operand_name, "dst": node_name, "type": type, "line": operand.lineno-1, "end_line": operand.end_lineno-1, "col_offset": operand.col_offset, "end_col_offset": operand.end_col_offset}) + else: + edges.append({"scope": copy(self.scope[-1]), "src": operand_name, "dst": node_name, "type": type}) + + # if len(ext_edges) > 0: # need this to avoid adding reverse edges to operation names and other highly connected nodes + edges.append({"scope": copy(self.scope[-1]), "src": node_name, "dst": operand_name, "type": type + "_rev"}) + + def generic_parse(self, node, operands, with_name=None, ensure_iterables=False): + + edges = [] + + if with_name is None: + node_name = self.get_name(node=node) + else: + node_name = with_name + + if len(self.scope) > 0: + edges.append({"scope": copy(self.scope[-1]), "src": node_name, "dst": self.scope[-1], "type": "mention_scope"}) + edges.append({"scope": copy(self.scope[-1]), "src": self.scope[-1], "dst": node_name, 
"type": "mention_scope_rev"}) + + for operand in operands: + if operand in ["body", "orelse", "finalbody"]: + self.parse_in_context(node_name, "operand", edges, node.__getattribute__(operand)) + else: + operand_ = node.__getattribute__(operand) + if operand_ is not None: + if isinstance(operand_, Iterable) and not isinstance(operand_, str): + # TODO: + # appears as leaf node if the iterable is empty. suggest adding an "EMPTY" element + for oper_ in operand_: + self.parse_and_add_operand(node_name, oper_, operand, edges) + else: + self.parse_and_add_operand(node_name, operand_, operand, edges) + + # TODO + # need to identify the benefit of this node + # maybe it is better to use node types in the graph + # edges.append({"scope": copy(self.scope[-1]), "src": node.__class__.__name__, "dst": node_name, "type": "node_type"}) + + return edges, node_name + + def parse_type_node(self, node): + # node.lineno, node.col_offset, node.end_lineno, node.end_col_offset + if node.lineno == node.end_lineno: + type_str = self.source[node.lineno][node.col_offset - 1: node.end_col_offset] + # print(type_str) + else: + type_str = "" + for ln in range(node.lineno - 1, node.end_lineno): + if ln == node.lineno - 1: + type_str += self.source[ln][node.col_offset - 1:].strip() + elif ln == node.end_lineno - 1: + type_str += self.source[ln][:node.end_col_offset].strip() + else: + type_str += self.source[ln].strip() + return type_str + + def parse_Module(self, node): + edges, module_name = self.generic_parse(node, []) + self.scope.append(module_name) + self.parse_in_context(module_name, "module", edges, node.body) + self.scope.pop(-1) + return edges, module_name + + def parse_FunctionDef(self, node): + # edges, f_name = self.generic_parse(node, ["name", "args", "returns", "decorator_list"]) + # edges, f_name = self.generic_parse(node, ["args", "returns", "decorator_list"]) + + # need to creare function name before generic_parse so that the scope is set up correctly + # scope is used to create 
local mentions of variable and function names + fdef_node_name = self.get_name(node=node) + self.scope.append(fdef_node_name) + + to_parse = [] + if len(node.args.args) > 0 or node.args.vararg is not None: + to_parse.append("args") + if len("decorator_list") > 0: + to_parse.append("decorator_list") + + edges, f_name = self.generic_parse(node, to_parse, with_name=fdef_node_name) + + if node.returns is not None: + # returns stores return type annotation + # can contain quotes + # https://stackoverflow.com/questions/46458470/should-you-put-quotes-around-type-annotations-in-python + # https://www.python.org/dev/peps/pep-0484/#forward-references + annotation_string = self.get_source_from_ast_range( + node.returns.lineno, node.returns.end_lineno, + node.returns.col_offset, node.returns.end_col_offset + ) + annotation = GNode(name=annotation_string, + type="type_annotation") + edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": f_name, "type": "returned_by", "line": node.returns.lineno - 1, "end_line": node.returns.end_lineno - 1, "col_offset": node.returns.col_offset, "end_col_offset": node.returns.end_col_offset}) + # do not use reverse edges for types, will result in leak from function to function + # edges.append({"scope": copy(self.scope[-1]), "src": f_name, "dst": annotation, "type": 'returns'}) + + # if node.returns: + # print(self.source[node.lineno -1]) # can get definition string here + + self.parse_in_context(f_name, "function", edges, node.body) + + self.scope.pop(-1) + + # func_name = GNode(name=node.name, type="Name") + ext_edges, func_name = self.parse_as_mention(name=node.name) + edges.extend(ext_edges) + + assert isinstance(node.name, str) + edges.append({"scope": copy(self.scope[-1]), "src": f_name, "dst": func_name, "type": "function_name"}) + edges.append({"scope": copy(self.scope[-1]), "src": func_name, "dst": f_name, "type": "function_name_rev"}) + + return edges, f_name + + def parse_AsyncFunctionDef(self, node): + return 
self.parse_FunctionDef(node) + + def parse_ClassDef(self, node): + + edges, class_node_name = self.generic_parse(node, []) + + self.scope.append(class_node_name) + self.parse_in_context(class_node_name, "class", edges, node.body) + self.scope.pop(-1) + + ext_edges, cls_name = self.parse_as_mention(name=node.name) + edges.extend(ext_edges) + edges.append({"scope": copy(self.scope[-1]), "src": class_node_name, "dst": cls_name, "type": "class_name"}) + edges.append({"scope": copy(self.scope[-1]), "src": cls_name, "dst": class_node_name, "type": "class_name_rev"}) + + return edges, class_node_name + + def parse_arg(self, node): + # node.annotation stores type annotation + # if node.annotation: + # print(self.source[node.lineno-1]) # can get definition string here + # print(node.arg) + + # # included mention + name = self.get_name(node=node) + edges, mention_name = self.parse_as_mention(node.arg) + edges.append({"scope": copy(self.scope[-1]), "src": mention_name, "dst": name, "type": 'arg'}) + edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": mention_name, "type": 'arg_rev'}) + # edges, name = self.generic_parse(node, ["arg"]) + if node.annotation is not None: + # can contain quotes + # https://stackoverflow.com/questions/46458470/should-you-put-quotes-around-type-annotations-in-python + # https://www.python.org/dev/peps/pep-0484/#forward-references + annotation_string = self.get_source_from_ast_range( + node.annotation.lineno, node.annotation.end_lineno, + node.annotation.col_offset, node.annotation.end_col_offset + ) + annotation = GNode(name=annotation_string, + type="type_annotation") + edges.append({"scope": copy(self.scope[-1]), "src": annotation, "dst": name, "type": 'annotation_for', "line": node.annotation.lineno-1, "end_line": node.annotation.end_lineno-1, "col_offset": node.annotation.col_offset, "end_col_offset": node.annotation.end_col_offset, "var_line": node.lineno-1, "var_end_line": node.end_lineno-1, "var_col_offset": node.col_offset, 
"var_end_col_offset": node.end_col_offset}) + # do not use reverse edges for types, will result in leak from function to function + # edges.append({"scope": copy(self.scope[-1]), "src": name, "dst": annotation, "type": 'annotation'}) + return edges, name + # return self.generic_parse(node, ["arg", "annotation"]) + + def parse_arguments(self, node): + # have missing fields. example: + # arguments(args=[arg(arg='self', annotation=None), arg(arg='tqdm_cls', annotation=None), arg(arg='sleep_interval', annotation=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]) + + # vararg constains type annotations + return self.generic_parse(node, ["args", "vararg"]) + + def parse_With(self, node): + edges, with_name = self.generic_parse(node, ["items"]) + + self.parse_in_context(with_name, "with", edges, node.body) + + return edges, with_name + + def parse_AsyncWith(self, node): + return self.parse_With(node) + + def parse_withitem(self, node): + return self.generic_parse(node, ['context_expr', 'optional_vars']) + + def parse_ExceptHandler(self, node): + # have missing fields. 
example: + # not parsing "name" field + # except handler is unique for every function + return self.generic_parse(node, ["type"]) + + def parse_name(self, node): + edges = [] + # if type(node) == ast.Attribute: + # left, ext_edges = self.parse(node.value) + # right = node.attr + # return self.parse(node.value) + "___" + node.attr + if type(node) == ast.Name: + return self.parse_as_mention(str(node.id)) + elif type(node) == ast.NameConstant: + return GNode(name=str(node.value), type="NameConstant") + + def parse_Name(self, node): + return self.parse_name(node) + + def parse_op_name(self, node): + return GNode(name=node.__class__.__name__, type="Op") + # return node.__class__.__name__ + + def parse_Str(self, node): + return self.generic_parse(node, []) + # return node.s + + def parse_If(self, node): + + edges, if_name = self.generic_parse(node, ["test"]) + + self.parse_in_context(if_name, "if_true", edges, node.body) + self.parse_in_context(if_name, "if_false", edges, node.orelse) + + return edges, if_name + + def parse_For(self, node): + + edges, for_name = self.generic_parse(node, ["target", "iter"]) + + self.parse_in_context(for_name, "for", edges, node.body) + self.parse_in_context(for_name, "for_orelse", edges, node.orelse) + + return edges, for_name + + def parse_AsyncFor(self, node): + return self.parse_For(node) + + def parse_Try(self, node): + + edges, try_name = self.generic_parse(node, []) + + self.parse_in_context(try_name, "try", edges, node.body) + + for h in node.handlers: + + handler_name, ext_edges = self.parse_operand(h) + edges.extend(ext_edges) + self.parse_in_context([try_name, handler_name], ["try_except", "try_handler"], edges, h.body) + + self.parse_in_context(try_name, "try_final", edges, node.finalbody) + self.parse_in_context(try_name, "try_else", edges, node.orelse) + + return edges, try_name + + def parse_While(self, node): + + edges, while_name = self.generic_parse(node, []) + + cond_name, ext_edges = self.parse_operand(node.test) + 
edges.extend(ext_edges) + + self.parse_in_context([while_name, cond_name], ["while", "if_true"], edges, node.body) + + return edges, while_name + + def parse_Expr(self, node): + edges = [] + expr_name, ext_edges = self.parse_operand(node.value) + edges.extend(ext_edges) + + # for cond_name, cons_stat in zip(self.current_condition, self.condition_status): + # edges.append({"scope": copy(self.scope[-1]), "src": expr_name, "dst": cond_name, "type": "depends_on_" + cons_stat}) + return edges, expr_name + + +if __name__ == "__main__": + import sys + f_bodies = pd.read_pickle(sys.argv[1]) + failed = 0 + for ind, c in enumerate(f_bodies['body_normalized']): + if isinstance(c, str): + # c = """def g(): + # yield 1""" + try: + g = AstGraphGenerator(c.lstrip()) + except SyntaxError as e: + print(e) + continue + failed += 1 + edges = g.get_edges() + # edges.to_csv(os.path.join(os.path.dirname(sys.argv[1]), "body_edges.csv"), mode="a", index=False, header=(ind==0)) + print("\r%d/%d" % (ind, len(f_bodies['body_normalized'])), end="") + else: + print("Skipped not a string") + + print(" " * 30, end="\r") + print(failed, len(f_bodies['body_normalized'])) diff --git a/SourceCodeTools/code/python_tokens_to_bpe_subwords.py b/SourceCodeTools/code/ast/python_tokens_to_bpe_subwords.py similarity index 90% rename from SourceCodeTools/code/python_tokens_to_bpe_subwords.py rename to SourceCodeTools/code/ast/python_tokens_to_bpe_subwords.py index 9d6ad7c5..648839a8 100644 --- a/SourceCodeTools/code/python_tokens_to_bpe_subwords.py +++ b/SourceCodeTools/code/ast/python_tokens_to_bpe_subwords.py @@ -59,9 +59,9 @@ 'Pow': "**", 'FloorDiv': "//", 'GtE': ">=", - 'USub': "-", # this appearsto be the operator to chenge number sign -5 + 'USub': "-", # this appears to be the operator to change number sign -5 'Invert': "~", - 'UAdd': "+", # this appearsto be the operator to chenge number sign +5, method __pos__ + 'UAdd': "+", # this appears to be the operator to change number sign +5, method __pos__ 
def op_tokenizer(op: str):
    """Map a Python AST operator name to its surface form for BPE tokenization.

    Falls back to the operator name itself when no mapping is known.
    """
    return python_ops_to_bpe.get(op, op)


@lru_cache
def op_tokenize_or_none(op, tokenize_func):
    """Tokenize the literal form of `op`, or return None when `op` has no literal.

    Cached because the same small set of operator names is looked up repeatedly.
    """
    return tokenize_func(python_ops_to_literal[op]) if op in python_ops_to_literal else None


# ---------------------------------------------------------------------------
# SourceCodeTools/code/common.py
# ---------------------------------------------------------------------------
import hashlib
import os
import sqlite3
from functools import lru_cache

import pandas as pd


class SQLTable:
    """A DataFrame staged as a table in a throwaway SQLite database file.

    The backing file is deleted when the object is garbage collected.
    """

    def __init__(self, df, filename, table_name):
        self.conn = sqlite3.connect(filename)
        self.path = filename
        self.table_name = table_name

        # index=False: the DataFrame index is not written. (The previous
        # `index_label=df.columns` argument was dropped — pandas ignores
        # index_label entirely when index=False.)
        df.to_sql(self.table_name, con=self.conn, if_exists='replace', index=False)

    def query(self, query_string):
        """Run `query_string` against the staged table, returning a DataFrame."""
        return pd.read_sql(query_string, self.conn)

    def __del__(self):
        self.conn.close()
        if os.path.isfile(self.path):
            os.remove(self.path)


def create_node_repr(nodes):
    """Return a hashable (serialized_name, type) pair for every node row."""
    return list(zip(nodes['serialized_name'], nodes['type']))


def compute_long_id(obj):
    """Deterministic 32-hex-char id derived from `repr(obj)` (md5)."""
    return hashlib.md5(repr(obj).encode('utf-8')).hexdigest()


def map_id_columns(df, column_names, mapper):
    """Remap id columns of `df` through `mapper`; unmapped ids become pd.NA.

    Columns listed in `column_names` but absent from `df` are skipped.
    Operates on a copy — the input DataFrame is not mutated.
    """
    df = df.copy()
    for col in column_names:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: mapper.get(x, pd.NA))
    return df


def merge_with_file_if_exists(df, merge_with_file):
    """Concatenate `df` after the contents of `merge_with_file`, if that file exists."""
    # Imported lazily so this module stays importable without project extras.
    from SourceCodeTools.code.data.file_utils import unpersist

    if os.path.isfile(merge_with_file):
        original_data = unpersist(merge_with_file)
        data = pd.concat([original_data, df], axis=0)
    else:
        data = df
    return data


def custom_tqdm(iterable, total, message):
    """Project-standard tqdm wrapper (non-sticky bar, dynamic width)."""
    # Imported lazily so this module stays importable without tqdm installed.
    from tqdm import tqdm
    return tqdm(iterable, total=total, desc=message, leave=False, dynamic_ncols=True)


def map_columns(input_table, id_map, columns, columns_special=None):
    """Remap `columns` of `input_table` through `id_map`.

    :param columns_special: optional list of (column_name, map_func) pairs for
        columns that need custom remapping logic.
    :return: the remapped table, or None when it ends up empty.
    """
    input_table = map_id_columns(input_table, columns, id_map)

    if columns_special is not None:
        assert isinstance(columns_special, list), "`columns_special` should be a list of (column, func) pairs"
        for column, map_func in columns_special:
            input_table[column] = map_func(input_table[column], id_map)

    if len(input_table) == 0:
        return None
    else:
        return input_table


def grow_with_chunks(chunks, additional_dtypes):
    """Concatenate an iterable of DataFrame chunks into one DataFrame.

    Dtypes from `additional_dtypes` are applied to the columns that actually
    appear in the chunks. Returns None when `chunks` is empty.

    Chunks are collected and concatenated once at the end — concatenating
    inside the loop grows quadratically with the number of chunks.
    """
    dtypes = {}
    collected = []

    for chunk in chunks:
        for col, type_ in additional_dtypes.items():
            if col in chunk.columns:
                dtypes[col] = type_

        collected.append(chunk.astype(dtypes))

    if not collected:
        return None
    return pd.concat(collected, copy=False)


def return_chunks(chunks, additional_dtypes):
    """Yield chunks one by one with `additional_dtypes` applied (streaming variant)."""
    dtypes = {}

    for chunk in chunks:
        for col, type_ in additional_dtypes.items():
            if col in chunk.columns:
                dtypes[col] = type_

        chunk = chunk.astype(dtypes, copy=False)
        yield chunk


def read_nodes(node_path, as_chunks=False):
    """Read the nodes table with compact dtypes.

    :param as_chunks: when True, return a generator of chunks instead of one table.
    """
    # Imported lazily so this module stays importable without project extras.
    from SourceCodeTools.code.data.file_utils import unpersist

    dtypes = {
        "id": "int32",
        "serialized_name": "string",
    }

    nodes_chunks = unpersist(node_path, dtype=dtypes, chunksize=100000)

    additional_dtypes = {
        'type': 'category',
        "mentioned_in": "Int32",  # nullable int — not every node has a mention
        "string": "string"
    }

    if as_chunks is False:
        return grow_with_chunks(nodes_chunks, additional_dtypes)
    else:
        return return_chunks(nodes_chunks, additional_dtypes)


def read_edges(edge_path, as_chunks=False):
    """Read the edges table with compact dtypes.

    :param as_chunks: when True, return a generator of chunks instead of one table.
    """
    # Imported lazily so this module stays importable without project extras.
    from SourceCodeTools.code.data.file_utils import unpersist

    dtypes = {
        "id": "int32",
        "source_node_id": "int32",
        "target_node_id": "int32",
    }

    edge_chunks = unpersist(edge_path, dtype=dtypes, chunksize=100000)

    additional_types = {
        "type": 'category',
        "mentioned_in": "Int32",
        "file_id": "Int32"
    }

    if as_chunks:
        return return_chunks(edge_chunks, additional_types)
    else:
        return grow_with_chunks(edge_chunks, additional_types)
# SourceCodeTools/code/data/AbstractDatasetCreator.py
import logging
import os
import shelve
import shutil
import tempfile
from abc import abstractmethod
from collections import defaultdict
from copy import copy
from functools import partial
from os.path import join

import pandas as pd
from tqdm import tqdm

from SourceCodeTools.code.annotator_utils import map_offsets
from SourceCodeTools.code.common import map_columns, read_edges, read_nodes
from SourceCodeTools.code.data.file_utils import get_random_name, unpersist, persist, unpersist_if_present
from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import special_mapping


class AbstractDatasetCreator:
    """
    Merges several environments indexed with Sourcetrail into a single graph.
    """

    # How each per-environment file is merged into the global graph.
    # NOTE(review): output extensions aligned to ".json" — merge_graph_without_ast /
    # merge_graph_with_ast read "common_nodes.json" etc., and create_global_file's
    # extension check rejected the previous ".jsonl" names.
    merging_specification = {
        "nodes.bz2": {"columns": ['id'], "output_path": "common_nodes.json", "ensure_unique_with": ['type', 'serialized_name']},
        "edges.bz2": {"columns": ['target_node_id', 'source_node_id'], "output_path": "common_edges.json"},
        "source_graph_bodies.bz2": {"columns": ['id'], "output_path": "common_source_graph_bodies.json", "columns_special": [("replacement_list", map_offsets)]},
        "function_variable_pairs.bz2": {"columns": ['src'], "output_path": "common_function_variable_pairs.json"},
        "call_seq.bz2": {"columns": ['src', 'dst'], "output_path": "common_call_seq.json"},

        "nodes_with_ast.bz2": {"columns": ['id', 'mentioned_in'], "output_path": "common_nodes.json", "ensure_unique_with": ['type', 'serialized_name']},
        "edges_with_ast.bz2": {"columns": ['target_node_id', 'source_node_id', 'mentioned_in'], "output_path": "common_edges.json"},
        "offsets.bz2": {"columns": ['node_id'], "output_path": "common_offsets.json", "columns_special": [("mentioned_in", map_offsets)]},
        "filecontent_with_package.bz2": {"columns": [], "output_path": "common_filecontent.json"},
        "name_mappings.bz2": {"columns": [], "output_path": "common_name_mappings.json"},
    }

    files_for_merging = [
        "nodes.bz2", "edges.bz2", "source_graph_bodies.bz2", "function_variable_pairs.bz2", "call_seq.bz2"
    ]
    files_for_merging_with_ast = [
        "nodes_with_ast.bz2", "edges_with_ast.bz2", "source_graph_bodies.bz2", "function_variable_pairs.bz2",
        "call_seq.bz2", "offsets.bz2", "filecontent_with_package.bz2", "name_mappings.bz2"
    ]

    # Subclasses populate these to prune unwanted edge/node types.
    restricted_edges = {}
    restricted_in_types = {}

    type_annotation_edge_types = []

    environments = None
    edge_priority = dict()

    def __init__(
            self, path, lang, bpe_tokenizer, create_subword_instances, connect_subwords, only_with_annotations,
            do_extraction=False, visualize=False, track_offsets=False, remove_type_annotations=False,
            recompute_l2g=False
    ):
        """
        :param path: path to source code dataset
        :param lang: language to use for AST parser (only Python for now)
        :param bpe_tokenizer: path to bpe tokenizer model
        :param create_subword_instances: whether to create nodes that represent subword instances (doubles the
            number of nodes)
        :param connect_subwords: whether to connect subword instances so that the order of subwords is stored
            in the graph. Has effect only when create_subword_instances=True
        :param only_with_annotations: include only packages that have type annotations into the final graph
        :param do_extraction: when True, process source code and extract AST edges. Otherwise, existing files are
            used.
        :param visualize: visualize graph using pygraphviz and store as PDF (infeasible for large graphs)
        :param track_offsets: store offset information and map node occurrences to global graph ids
        :param remove_type_annotations: when True, removes all type annotations from the graph and stores them
            in a file called `type_annotations.bz2`
        :param recompute_l2g: when True, run merging operation again, without extracting AST nodes and edges
            a second time
        """
        self.indexed_path = path
        self.lang = lang
        self.bpe_tokenizer = bpe_tokenizer
        self.create_subword_instances = create_subword_instances
        self.connect_subwords = connect_subwords
        self.only_with_annotations = only_with_annotations
        self.extract = do_extraction
        self.visualize = visualize
        self.track_offsets = track_offsets
        self.remove_type_annotations = remove_type_annotations
        self.recompute_l2g = recompute_l2g

        self.path = path
        self._prepare_environments()

        self._init_cache()

    def _init_cache(self):
        """Create a temp dir holding a shelve-backed local->global id cache."""
        # TODO this is wrong, use standard utilities
        rnd_name = get_random_name()

        self.tmp_dir = os.path.join(tempfile.gettempdir(), rnd_name)
        if os.path.isdir(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.mkdir(self.tmp_dir)

        self.local2global_cache_filename = os.path.join(self.tmp_dir, "local2global_cache.db")
        self.local2global_cache = shelve.open(self.local2global_cache_filename)

    def __del__(self):
        self.local2global_cache.close()
        shutil.rmtree(self.tmp_dir)
        # os.remove(self.local2global_cache_filename)  # TODO nofile on linux, need to check

    def handle_parallel_edges(self, edges_path):
        """Deduplicate edges, keeping at most one edge per (src, dst) pair.

        Global (Sourcetrail) edge types are deduplicated on (type, src, dst)
        across the whole file; for other types the highest-priority edge
        (per `edge_priority`, default 3) wins. Edge ids are re-assigned
        contiguously. Rewrites `edges_path` in place via a temp file.
        """
        logging.info("Handle parallel edges")
        last_id = 0

        global_edge_types = set(special_mapping.keys()) | set(special_mapping.values())

        existing_global_edges = set()

        temp_edges = join(os.path.dirname(edges_path), "temp_" + os.path.basename(edges_path))

        for ind, edges in enumerate(read_edges(edges_path, as_chunks=True)):
            edges["id"] = range(last_id, len(edges) + last_id)

            edge_bank = defaultdict(list)
            ids_to_remove = set()
            for id_, type_, src, dst in edges[["id", "type", "source_node_id", "target_node_id"]].values:
                if type_ in global_edge_types:
                    global_edge = (type_, src, dst)
                    if global_edge not in existing_global_edges:
                        existing_global_edges.add(global_edge)
                    else:
                        ids_to_remove.add(id_)
                else:
                    edge_bank[(src, dst)].append((id_, type_))

            for key, parallel_edges in edge_bank.items():
                if len(parallel_edges) > 1:
                    # keep only the highest-priority edge between the same pair of nodes
                    parallel_edges = sorted(parallel_edges, key=lambda x: self.edge_priority.get(x[1], 3))
                    ids_to_remove.update(pe[0] for pe in parallel_edges[1:])

            edges = edges[
                edges["id"].apply(lambda id_: id_ not in ids_to_remove)
            ]

            edges["id"] = range(last_id, len(edges) + last_id)
            last_id = len(edges) + last_id

            kwargs = self.get_writing_mode(temp_edges.endswith("csv"), first_written=ind != 0)
            persist(edges, temp_edges, **kwargs)

        os.remove(edges_path)
        os.rename(temp_edges, edges_path)

    def post_pruning(self, nodes_path, edges_path):
        """Drop restricted edge types and edges into nodes of restricted types."""
        logging.info("Post pruning")

        restricted_nodes = set()

        for nodes in read_nodes(nodes_path, as_chunks=True):
            restricted_nodes.update(
                nodes[
                    nodes["type"].apply(lambda type_: type_ in self.restricted_in_types)
                ]["id"]
            )

        temp_edges = join(os.path.dirname(edges_path), "temp_" + os.path.basename(edges_path))

        for ind, edges in enumerate(read_edges(edges_path, as_chunks=True)):
            edges = edges[
                edges["type"].apply(lambda type_: type_ not in self.restricted_edges)
            ]

            edges = edges[
                edges["target_node_id"].apply(lambda node_id: node_id not in restricted_nodes)
            ]

            kwargs = self.get_writing_mode(temp_edges.endswith("csv"), first_written=ind != 0)
            persist(edges, temp_edges, **kwargs)

        os.remove(edges_path)
        os.rename(temp_edges, edges_path)

    def compact_mapping_for_l2g(self, global_nodes, filename):
        """Renumber `global_nodes` densely from 0 and rewrite every env's l2g file."""
        if len(global_nodes) > 0:
            self.update_l2g_file(
                mapping=self.create_compact_mapping(global_nodes),
                filename=filename
            )

    @staticmethod
    def create_compact_mapping(node_ids):
        """Map each node id to its position: {id: 0, id: 1, ...}."""
        return dict(zip(node_ids, range(len(node_ids))))

    def update_l2g_file(self, mapping, filename):
        """Apply `mapping` to the `global_id` column of `filename` in every environment."""
        for env_path in tqdm(self.environments, desc=f"Fixing {filename}"):
            filepath = os.path.join(env_path, filename)
            if not os.path.isfile(filepath):
                continue
            l2g = unpersist(filepath)
            l2g["global_id"] = l2g["global_id"].apply(lambda id_: mapping.get(id_, None))
            persist(l2g, filepath)

    def get_local2global(self, path):
        """Load (and memoize in the shelve cache) a local->global id dict, or None."""
        if path in self.local2global_cache:
            return self.local2global_cache[path]
        else:
            local2global_df = unpersist_if_present(path)
            if local2global_df is None:
                return None
            else:
                local2global = dict(zip(local2global_df['id'], local2global_df['global_id']))
                self.local2global_cache[path] = local2global
                return local2global

    @staticmethod
    def persist_if_not_none(table, dir, name):
        if table is not None:
            path = os.path.join(dir, name)
            persist(table, path)

    def write_type_annotation_flag(self, edges, output_dir):
        """Touch a `has_annotations` marker file if any annotation edges are present."""
        if len(self.type_annotation_edge_types) > 0:
            query_str = " or ".join(f"type == '{edge_type}'" for edge_type in self.type_annotation_edge_types)
            if len(edges.query(query_str)) > 0:
                with open(os.path.join(output_dir, "has_annotations"), "w") as has_annotations:
                    pass  # only the file's existence matters

    def write_local(self, dir, local2global=None, local2global_with_ast=None, **kwargs):
        """Persist per-environment tables; l2g maps are always (re)written."""
        if not self.recompute_l2g:
            for var_name, var_ in kwargs.items():
                self.persist_if_not_none(var_, dir, var_name + ".bz2")

        self.persist_if_not_none(local2global, dir, "local2global.bz2")
        self.persist_if_not_none(local2global_with_ast, dir, "local2global_with_ast.bz2")

    def merge_files(self, env_path, filename, map_filename, columns_to_map, original, columns_special=None):
        """Append an environment's table (ids mapped to global) onto `original`."""
        input_table_path = join(env_path, filename)
        local2global = self.get_local2global(join(env_path, map_filename))
        if os.path.isfile(input_table_path) and local2global is not None:
            input_table = unpersist(input_table_path)
            if self.only_with_annotations:
                if not os.path.isfile(join(env_path, "has_annotations")):
                    return original
            new_table = map_columns(input_table, local2global, columns_to_map, columns_special=columns_special)
            if original is None:
                return new_table
            else:
                # DataFrame.append was removed in pandas 2.0
                return pd.concat([original, new_table], axis=0)
        else:
            return original

    def read_mapped_local(self, env_path, filename, map_filename, columns_to_map, columns_special=None):
        """Read one environment's table with id columns mapped to global ids, or None."""
        input_table_path = join(env_path, filename)
        local2global = self.get_local2global(join(env_path, map_filename))
        if os.path.isfile(input_table_path) and local2global is not None:
            if self.only_with_annotations:
                if not os.path.isfile(join(env_path, "has_annotations")):
                    return None
            input_table = unpersist(input_table_path)
            new_table = map_columns(input_table, local2global, columns_to_map, columns_special=columns_special)
            return new_table
        else:
            return None

    def get_writing_mode(self, is_csv, first_written):
        """Return persist() kwargs for appending after the first chunk is written."""
        kwargs = {}
        if first_written is True:
            kwargs["mode"] = "a"
            if is_csv:
                kwargs["header"] = False
        return kwargs

    def create_global_file(
            self, local_file, local2global_file, columns, output_path, message, ensure_unique_with=None,
            columns_special=None
    ):
        """Stream-merge `local_file` from every environment into `output_path`.

        When `ensure_unique_with` is given, rows duplicating an already-written
        combination of those columns are skipped.
        """
        # ".jsonl" included: previous endswith("json") check rejected jsonl names.
        assert output_path.endswith((".json", ".jsonl", ".csv")), f"Unsupported output format: {output_path}"

        if ensure_unique_with is not None:
            unique_values = set()
        else:
            unique_values = None

        first_written = False

        for ind, env_path in tqdm(
                enumerate(self.environments), desc=message, leave=True,
                dynamic_ncols=True, total=len(self.environments)
        ):
            mapped_local = self.read_mapped_local(
                env_path, local_file, local2global_file, columns, columns_special=columns_special
            )

            if mapped_local is not None:
                if unique_values is not None:
                    unique_verify = list(zip(*(mapped_local[col_name] for col_name in ensure_unique_with)))

                    mapped_local = mapped_local.loc[
                        map(lambda x: x not in unique_values, unique_verify)
                    ]
                    unique_values.update(unique_verify)

                kwargs = self.get_writing_mode(output_path.endswith("csv"), first_written)

                persist(mapped_local, output_path, **kwargs)
                first_written = True

    def filter_orphaned_nodes(self, nodes_path, edges_path):
        """Drop nodes that are not referenced by any edge; rewrites `nodes_path` in place."""
        logging.info("Filter orphaned nodes")
        active_nodes = set()

        for edges in read_edges(edges_path, as_chunks=True):
            active_nodes.update(edges['source_node_id'])
            active_nodes.update(edges['target_node_id'])

        temp_nodes = join(os.path.dirname(nodes_path), "temp_" + os.path.basename(nodes_path))

        for ind, nodes in enumerate(read_nodes(nodes_path, as_chunks=True)):
            nodes = nodes[
                nodes['id'].apply(lambda id_: id_ in active_nodes)
            ]

            kwargs = self.get_writing_mode(temp_nodes.endswith("csv"), first_written=ind != 0)
            persist(nodes, temp_nodes, **kwargs)

        os.remove(nodes_path)
        os.rename(temp_nodes, nodes_path)

    def join_files(self, files, local2global_filename, output_dir):
        """Merge every file in `files` according to `merging_specification`."""
        for file in files:
            params = copy(self.merging_specification[file])
            params["output_path"] = join(output_dir, params.pop("output_path"))
            self.create_global_file(file, local2global_filename, message=f"Merging {file}", **params)

    def merge_graph_without_ast(self, output_path):
        """Merge Sourcetrail-only tables and post-process the resulting graph."""
        self.join_files(self.files_for_merging, "local2global.bz2", output_path)

        get_path = partial(join, output_path)

        nodes_path = get_path("common_nodes.json")
        edges_path = get_path("common_edges.json")

        self.filter_orphaned_nodes(
            nodes_path,
            edges_path,
        )
        node_names = self.extract_node_names(
            nodes_path, min_count=2
        )
        if node_names is not None:
            persist(node_names, get_path("node_names.json"))

        self.handle_parallel_edges(edges_path)

        if self.visualize:
            self.visualize_func(
                read_nodes(nodes_path),
                read_edges(edges_path),
                get_path("visualization.pdf")
            )

    def merge_graph_with_ast(self, output_path):
        """Merge Sourcetrail+AST tables and post-process the resulting graph."""
        self.join_files(self.files_for_merging_with_ast, "local2global_with_ast.bz2", output_path)

        get_path = partial(join, output_path)

        nodes_path = get_path("common_nodes.json")
        edges_path = get_path("common_edges.json")

        if self.remove_type_annotations:
            self.filter_type_edges(nodes_path, edges_path)

        self.handle_parallel_edges(edges_path)

        self.post_pruning(nodes_path, edges_path)

        self.filter_orphaned_nodes(
            nodes_path,
            edges_path,
        )
        node_names = self.extract_node_names(
            nodes_path, min_count=2
        )
        if node_names is not None:
            persist(node_names, get_path("node_names.json"))

        if self.visualize:
            self.visualize_func(
                read_nodes(nodes_path),
                read_edges(edges_path),
                get_path("visualization.pdf")
            )

    @abstractmethod
    def create_output_dirs(self, output_path):
        pass

    @abstractmethod
    def _prepare_environments(self):
        pass

    @staticmethod
    @abstractmethod
    def filter_type_edges(nodes, edges):
        pass

    @staticmethod
    @abstractmethod
    def extract_node_names(nodes, min_count):
        pass

    @abstractmethod
    def do_extraction(self):
        pass

    def merge(self, output_directory):
        """Template method: optionally extract, then merge graphs into `output_directory`.

        NOTE(review): reconstructed as a concrete method — the previous version
        declared `merge` abstract with `pass` and left this body orphaned at
        class scope (referencing `self`), which is not valid Python.
        """
        if self.extract:
            logging.info("Extracting...")
            self.do_extraction()

        no_ast_path, with_ast_path = self.create_output_dirs(output_directory)

        if not self.only_with_annotations:
            self.merge_graph_without_ast(no_ast_path)

        self.merge_graph_with_ast(with_ast_path)

    @abstractmethod
    def visualize_func(self, nodes, edges, output_path):
        pass
make_tokenizer(load_bpe_model(bpe_tokenizer_path)) \ + if bpe_tokenizer_path else None + self.create_subword_instances = create_subword_instances + self.connect_subwords = connect_subwords + + def replace_mentions_with_subwords(self, edges): + """ + Process edges and tokenize certain node types + :param edges: List of edges + :return: List of edges, including new edges for subword tokenization + """ + + if self.create_subword_instances: + def produce_subw_edges(subwords, dst): + return self.produce_subword_edges_with_instances(subwords, dst) + else: + def produce_subw_edges(subwords, dst): + return self.produce_subword_edges(subwords, dst, self.connect_subwords) + + new_edges = [] + for edge in edges: + + if edge['type'] == "local_mention": + + dst = edge['dst'] + + if self.bpe is not None: + if hasattr(dst, "name_scope") and dst.name_scope == "local": + # TODO + # this rule seems to be irrelevant now + subwords = self.bpe(dst.name.split("@")[0]) + else: + subwords = self.bpe(edge['src'].name) + + new_edges.extend(produce_subw_edges(subwords, dst)) + else: + new_edges.append(edge) + + elif self.bpe is not None and edge["type"] == "__global_name": + # should not have global edges + # subwords = self.bpe(edge['src'].name) + # new_edges.extend(produce_subw_edges(subwords, edge['dst'])) + pass + elif self.bpe is not None and edge['src'].type in PythonSharedNodes.tokenizable_types_and_annotations: + new_edges.append(edge) + if edge['type'] != "global_mention_rev": + # should not have global edges here + pass + + dst = edge['src'] + subwords = self.bpe(dst.name) + new_edges.extend(produce_subw_edges(subwords, dst)) + else: + new_edges.append(edge) + + return new_edges + + @staticmethod + def connect_prev_next_subwords(edges, current, prev_subw, next_subw): + if next_subw is not None: + edges.append({ + 'src': current, + 'dst': next_subw, + 'type': 'next_subword', + 'offsets': None + }) + if prev_subw is not None: + edges.append({ + 'src': current, + 'dst': prev_subw, + 
'type': 'prev_subword', + 'offsets': None + }) + + def produce_subword_edges(self, subwords, dst, connect_subwords=False): + new_edges = [] + + subwords = list(map(lambda x: GNode(name=x, type="subword"), subwords)) + for ind, subword in enumerate(subwords): + new_edges.append({ + 'src': subword, + 'dst': dst, + 'type': 'subword', + 'offsets': None + }) + if connect_subwords: + self.connect_prev_next_subwords(new_edges, subword, subwords[ind - 1] if ind > 0 else None, + subwords[ind + 1] if ind < len(subwords) - 1 else None) + return new_edges + + def produce_subword_edges_with_instances(self, subwords, dst, connect_subwords=True): + new_edges = [] + + subwords = list(map(lambda x: GNode(name=x, type="subword"), subwords)) + instances = list(map(lambda x: GNode( + name=x.name + "@" + dst.name, type="subword_instance"), subwords)) + for ind, subword in enumerate(subwords): + subword_instance = instances[ind] + new_edges.append({ + 'src': subword, + 'dst': subword_instance, + 'type': 'subword_instance', + 'offsets': None + }) + new_edges.append({ + 'src': subword_instance, + 'dst': dst, + 'type': 'subword', + 'offsets': None + }) + if connect_subwords: + self.connect_prev_next_subwords(new_edges, subword_instance, instances[ind - 1] if ind > 0 else None, + instances[ind + 1] if ind < len(instances) - 1 else None) + return new_edges + + +class NodeIdResolver: + def __init__(self): + self.node_ids = {} + self.new_nodes = [] + self.stashed_nodes = [] + + self._resolver_cache = dict() + + def stash_new_nodes(self): + """ + Put new nodes into temporary storage. + :return: Nothing + """ + self.stashed_nodes.extend(self.new_nodes) + self.new_nodes = [] + + def get_node_id(self, type_name): + return hashlib.md5(type_name.encode('utf-8')).hexdigest() + + def resolve_node_id(self, node, **kwargs): + """ + Resolve node id from name and type, create new node is no nodes like that found. 
+ :param node: node + :param kwargs: + :return: updated node (return object with the save reference as input) + """ + if not hasattr(node, "id"): + node_repr = f"{node.name.strip()}_{node.type.strip()}" + + if node_repr in self.node_ids: + node_id = self.node_ids[node_repr] + node.setprop("id", node_id) + else: + new_id = self.get_node_id(node_repr) + self.node_ids[node_repr] = new_id + + if not PythonSharedNodes.is_shared(node) and not node.name == "unresolved_name": + assert "0x" in node.name + + self.new_nodes.append( + { + "id": new_id, + "type": node.type, + "serialized_name": node.name, + "mentioned_in": pd.NA, + "string": node.string + } + ) + if hasattr(node, "scope"): + self.resolve_node_id(node.scope) + self.new_nodes[-1]["mentioned_in"] = node.scope.id + node.setprop("id", new_id) + return node + + def prepare_for_write(self, from_stashed=False): + nodes = self.new_nodes_for_write(from_stashed)[ + ['id', 'type', 'serialized_name', 'mentioned_in', 'string'] + ] + + return nodes + + def new_nodes_for_write(self, from_stashed=False): + + new_nodes = pd.DataFrame( + self.new_nodes if not from_stashed else self.stashed_nodes) + if len(new_nodes) == 0: + return None + + new_nodes = new_nodes[ + ['id', 'type', 'serialized_name', 'mentioned_in', 'string'] + ].astype({"mentioned_in": "string", "id": "string"}) + + return new_nodes + + def adjust_ast_node_types(self, mapping, from_stashed=False): + nodes = self.new_nodes if not from_stashed else self.stashed_nodes + + for node in nodes: + node["type"] = mapping.get(node["type"], node["type"]) + + def drop_nodes(self, node_ids_to_drop, from_stashed=False): + nodes = self.new_nodes if not from_stashed else self.stashed_nodes + + position = 0 + while position < len(nodes): + if nodes[position]["id"] in node_ids_to_drop: + nodes.pop(position) + else: + position += 1 + + def map_mentioned_in_to_global(self, mapping, from_stashed=False): + nodes = self.new_nodes if not from_stashed else self.stashed_nodes + + for node 
in nodes: + node["mentioned_in"] = mapping.get( + node["mentioned_in"], node["mentioned_in"]) + + +class AstProcessor(AstGraphGenerator): + def get_edges(self, as_dataframe=True): + edges = [] + all_edges, top_node_name = self.parse(self.root) + edges.extend(all_edges) + + if as_dataframe: + df = pd.DataFrame(edges) + df = df.astype( + {col: "Int32" for col in df.columns if col not in {"src", "dst", "type"}}) + + body = "\n".join(self.source) + cum_lens = get_cum_lens(body, as_bytes=True) + byte2char = get_byte_to_char_map(body) + + def format_offsets(edges: pd.DataFrame): + edges["start_line__end_line__start_column__end_column"] = list( + zip(edges["line"], edges["end_line"], + edges["col_offset"], edges["end_col_offset"]) + ) + + def into_offset(range): + try: + return to_offsets(body, [(*range, None)], cum_lens=cum_lens, b2c=byte2char, as_bytes=True)[-1][:2] + except: + return None + + edges["offsets"] = edges["start_line__end_line__start_column__end_column"].apply( + into_offset) + edges.drop( + axis=1, + labels=[ + "start_line__end_line__start_column__end_column", + "line", + "end_line", + "col_offset", + "end_col_offset" + ], inplace=True + ) + + format_offsets(df) + return df + else: + body = "\n".join(self.source) + cum_lens = get_cum_lens(body, as_bytes=True) + byte2char = get_byte_to_char_map(body) + + def format_offsets(edge): + def into_offset(range): + try: + return to_offsets(body, [(*range, None)], cum_lens=cum_lens, b2c=byte2char, as_bytes=True)[-1][:2] + except: + return None + + if "line" in edge: + edge["offsets"] = into_offset( + (edge["line"], edge["end_line"], + edge["col_offset"], edge["end_col_offset"]) + ) + edge.pop("line") + edge.pop("end_line") + edge.pop("col_offset") + edge.pop("end_col_offset") + else: + edge["offsets"] = None + if "var_line" in edge: + edge["var_offsets"] = into_offset( + (edge["var_line"], edge["var_end_line"], + edge["var_col_offset"], edge["var_end_col_offset"]) + ) + edge.pop("var_line") + 
def standardize_new_edges(edges, node_resolver, mention_tokenizer):
    """
    Tokenize relevant node names, assign id to every node, collapse edge representation to id-based.
    :param edges: list of edges
    :param node_resolver: helper class that tracks node ids
    :param mention_tokenizer: helper class that performs tokenization of relevant nodes
    :return: the same list of edges, with src/dst/scope collapsed to node ids
    """
    edges = mention_tokenizer.replace_mentions_with_subwords(edges)

    # First pass: make sure every node object mentioned by an edge has an id.
    for edge in edges:
        edge["src"] = node_resolver.resolve_node_id(edge["src"])
        edge["dst"] = node_resolver.resolve_node_id(edge["dst"])
        if "scope" in edge:
            edge["scope"] = node_resolver.resolve_node_id(edge["scope"])

    # Second pass: collapse node objects to their ids; normalize missing fields.
    for edge in edges:
        edge["src"] = edge["src"].id
        edge["dst"] = edge["dst"].id
        if "scope" in edge:
            edge["scope"] = edge["scope"].id
        else:
            edge["scope"] = pd.NA
        edge["file_id"] = pd.NA

    return edges


def process_code_without_index(source_code, node_resolver, mention_tokenizer, track_offsets=False):
    """Parse one source file into AST edges (no Sourcetrail index involved).

    :return: (edges, ast_offsets); (None, None) when the file cannot be parsed
        or produces no edges. ast_offsets is None unless track_offsets is True.
    """
    # NOTE(review): error paths previously returned a 3-tuple while the success
    # path returned 2 values — callers unpacking two values crashed on errors.
    try:
        ast_processor = AstProcessor(source_code)
    except Exception:  # was a bare except; parsing can raise SyntaxError and friends
        return None, None
    try:  # TODO recursion error does not appear consistently. The issue is probably with library versions...
        edges = ast_processor.get_edges(as_dataframe=False)
    except RecursionError:
        return None, None

    if len(edges) == 0:
        return None, None

    # tokenize names, replace nodes with their ids
    edges = standardize_new_edges(edges, node_resolver, mention_tokenizer)

    if track_offsets:
        def get_valid_offsets(edges):
            """
            :param edges: list of edge dicts. Offset information is stored on the edge
                but relates to the source node.
            :return: locations of edges in the source file in the format
                (start, end, src_node_id, scope_id)
            """
            return [
                (edge["offsets"][0], edge["offsets"][1], edge["src"], edge["scope"])
                for edge in edges if edge["offsets"] is not None
            ]

        # recover ast offsets for the current file
        ast_offsets = get_valid_offsets(edges)
    else:
        ast_offsets = None

    return edges, ast_offsets


def build_ast_only_graph(
        source_codes, bpe_tokenizer_path, create_subword_instances, connect_subwords, lang, track_offsets=False
):
    """Build a global AST-only graph from an iterable of (package, file_id, source).

    :return: (nodes, edges, offsets) DataFrames; (None, None, None) when nothing
        was produced. `offsets` is None when track_offsets is False.
    """
    node_resolver = NodeIdResolver()
    mention_tokenizer = MentionTokenizer(bpe_tokenizer_path, create_subword_instances, connect_subwords)
    all_ast_edges = []
    all_offsets = []

    for package, source_code_id, source_code in tqdm(source_codes, desc="Processing modules"):
        source_code_ = source_code.lstrip()
        # leading whitespace removed by the parser; offsets must be shifted back by this much
        initial_strip = source_code[:len(source_code) - len(source_code_)]

        if not has_valid_syntax(source_code):
            continue

        edges, ast_offsets = process_code_without_index(
            source_code, node_resolver, mention_tokenizer, track_offsets=track_offsets
        )

        if ast_offsets is not None:
            adjust_offsets2(ast_offsets, len(initial_strip))

        if edges is None:
            continue

        # afterprocessing: attribute every edge to its source file
        for edge in edges:
            edge["file_id"] = source_code_id

        all_ast_edges.extend(edges)

        def format_offsets(ast_offsets, target):
            """
            Format offset as a record and add to the common storage for offsets
            :param ast_offsets:
            :param target: List where all other offsets are stored.
            :return: Nothing
            """
            if ast_offsets is not None:
                for offset in ast_offsets:
                    target.append({
                        "file_id": source_code_id,
                        "start": offset[0],
                        "end": offset[1],
                        "node_id": offset[2],
                        "mentioned_in": offset[3],
                        "string": source_code[offset[0]: offset[1]],
                        "package": package
                    })

        format_offsets(ast_offsets, target=all_offsets)

    node_resolver.stash_new_nodes()

    all_ast_nodes = node_resolver.new_nodes_for_write(from_stashed=True)

    if all_ast_nodes is None:
        return None, None, None

    def prepare_edges(all_ast_edges):
        """Deduplicate, drop self-loops, and assign sequential edge ids."""
        all_ast_edges = pd.DataFrame(all_ast_edges)
        all_ast_edges.drop_duplicates(["type", "src", "dst"], inplace=True)
        all_ast_edges = all_ast_edges.query("src != dst")
        all_ast_edges["id"] = 0

        all_ast_edges = all_ast_edges[["id", "type", "src", "dst", "file_id", "scope"]] \
            .rename({'src': 'source_node_id', 'dst': 'target_node_id', 'scope': 'mentioned_in'}, axis=1) \
            .astype({'file_id': 'Int32', "mentioned_in": 'string'})

        all_ast_edges["id"] = range(len(all_ast_edges))
        return all_ast_edges

    all_ast_edges = prepare_edges(all_ast_edges)

    if len(all_offsets) > 0:
        all_offsets = pd.DataFrame(all_offsets)
    else:
        all_offsets = None

    node2id = dict(zip(all_ast_nodes["id"], range(len(all_ast_nodes))))

    def map_columns_to_int(table, dense_columns, sparse_columns):
        """Replace hash-string node ids with dense integers (nullable for sparse cols)."""
        types = {column: "int64" for column in dense_columns}
        types.update({column: "Int64" for column in sparse_columns})

        for column, dtype in types.items():
            table[column] = table[column].apply(node2id.get).astype(dtype)

    map_columns_to_int(all_ast_nodes, dense_columns=["id"], sparse_columns=["mentioned_in"])
    map_columns_to_int(
        all_ast_edges,
        dense_columns=["source_node_id", "target_node_id"],
        sparse_columns=["mentioned_in"]
    )
    # NOTE(review): all_offsets is None when track_offsets=False (or no offsets
    # were collected) — the unconditional call here used to crash.
    if all_offsets is not None:
        map_columns_to_int(all_offsets, dense_columns=["node_id"], sparse_columns=["mentioned_in"])

    return all_ast_nodes, all_ast_edges, all_offsets


pd.options.mode.chained_assignment = None  # default='warn'
def create_test_data(output_dir):
    """Write a tiny synthetic ``source_code.bz2`` table into *output_dir*.

    One row per fake source file, columns ``id``, ``filecontent`` and
    ``package``; intended as smoke-test input for the AST graph builder.
    """
    # NOTE(review): inner indentation of the try/except snippet reconstructed
    # as 4 spaces — confirm against the original file.
    snippet_try = (
        "try:\n"
        "    a = b\n"
        "except Exception as e:\n"
        "    a = c\n"
        "else:\n"
        "    a = d\n"
        "finally:\n"
        "    print(a)"
    )
    test_code = pd.DataFrame({
        "id": [1, 2, 3],
        "filecontent": [
            "import numpy\nnumpy.array([1,2,3])",
            "from numpy.submodule import fn1 as f1, fn2 as f2\n",
            snippet_try,
        ],
        "package": [
            "any_name_1",
            "can use the same name here any_name_1",
            "a",
        ],
    })
    persist(test_code, os.path.join(output_dir, "source_code.bz2"))
Do not use on large graphs.") + args = parser.parse_args() + + if args.create_test_data: + print(f"Creating test data in {args.output_path}") + create_test_data(args.output_path) + sys.exit() + + source_code = unpersist(args.source_code) + + output_dir = args.output_path + + nodes, edges, offsets = build_ast_only_graph( + zip(source_code["package"], source_code["id"], + source_code["filecontent"]), args.bpe_tokenizer, + create_subword_instances=False, connect_subwords=False, lang="py", track_offsets=True + ) + + print(f"Writing output to {output_dir}") + persist(source_code, os.path.join(output_dir, "common_filecontent.bz2")) + persist(nodes, os.path.join(output_dir, "common_nodes.bz2")) + persist(edges, os.path.join(output_dir, "common_edges.bz2")) + persist(offsets, os.path.join(output_dir, "common_offsets.bz2")) + + if args.visualize: + visualize(nodes, edges, os.path.join(output_dir, "visualization.pdf")) + + +class AstDatasetCreator(AbstractDatasetCreator): + + merging_specification = { + "source_graph_bodies.bz2": {"columns": ['id'], "output_path": "common_source_graph_bodies.json", "columns_special": [("replacement_list", map_offsets)]}, + "function_variable_pairs.bz2": {"columns": ['src'], "output_path": "common_function_variable_pairs.json"}, + "call_seq.bz2": {"columns": ['src', 'dst'], "output_path": "common_call_seq.json"}, + + "nodes_with_ast.bz2": {"columns": ['id', 'mentioned_in'], "output_path": "common_nodes.json", "ensure_unique_with": ['type', 'serialized_name']}, + "edges_with_ast.bz2": {"columns": ['target_node_id', 'source_node_id', 'mentioned_in'], "output_path": "common_edges.json"}, + "offsets.bz2": {"columns": ['node_id', 'mentioned_in'], "output_path": "common_offsets.json"}, + "filecontent_with_package.bz2": {"columns": [], "output_path": "common_filecontent.json"}, + "name_mappings.bz2": {"columns": [], "output_path": "common_name_mappings.json"}, + } + + files_for_merging_with_ast = [ + "nodes_with_ast.bz2", "edges_with_ast.bz2", 
"source_graph_bodies.bz2", "function_variable_pairs.bz2", + "call_seq.bz2", "offsets.bz2", "filecontent_with_package.bz2" + ] + + edge_priority = { + "next": -1, "prev": -1, "global_mention": -1, "global_mention_rev": -1, + "calls": 0, + "called_by": 0, + "defines": 1, + "defined_in": 1, + "inheritance": 1, + "inherited_by": 1, + "imports": 1, + "imported_by": 1, + "uses": 2, + "used_by": 2, + "uses_type": 2, + "type_used_by": 2, + "mention_scope": 10, + "mention_scope_rev": 10, + "defined_in_function": 4, + "defined_in_function_rev": 4, + "defined_in_class": 5, + "defined_in_class_rev": 5, + "defined_in_module": 6, + "defined_in_module_rev": 6 + } + + restricted_edges = {"global_mention_rev"} + restricted_in_types = { + "Op", "Constant", "#attr#", "#keyword#", + 'CtlFlow', 'JoinedStr', 'Name', 'ast_Literal', + 'subword', 'type_annotation' + } + + type_annotation_edge_types = ['annotation_for', 'returned_by'] + + def __init__( + self, path, lang, bpe_tokenizer, create_subword_instances, connect_subwords, only_with_annotations, + do_extraction=False, visualize=False, track_offsets=False, remove_type_annotations=False, + recompute_l2g=False, chunksize=10000, keep_frac=1.0, seed=None + ): + self.chunksize = chunksize + self.keep_frac = keep_frac + self.seed = seed + super().__init__( + path, lang, bpe_tokenizer, create_subword_instances, connect_subwords, only_with_annotations, + do_extraction, visualize, track_offsets, remove_type_annotations, recompute_l2g + ) + + def __del__(self): + # TODO use /tmp and add flag for overriding temp folder location + if hasattr(self, "temp_path") and os.path.isdir(self.temp_path): + shutil.rmtree(self.temp_path) + # pass + + def _prepare_environments(self): + dataset_location = os.path.dirname(self.path) + temp_path = os.path.join(dataset_location, "temp_graph_builder") + self.temp_path = temp_path + if os.path.isdir(temp_path): + raise FileExistsError(f"Directory exists: {temp_path}") + os.mkdir(temp_path) + + rnd_state = 
np.random.RandomState(self.seed) + print('path', self.path) + for ind, chunk in enumerate(pd.read_csv(self.path, chunksize=self.chunksize)): + chunk = chunk.sample(frac=self.keep_frac, random_state=rnd_state) + chunk_path = os.path.join(temp_path, f"chunk_{ind}") + os.mkdir(chunk_path) + persist(chunk, os.path.join(chunk_path, "source_code.bz2")) + + paths = (os.path.join(temp_path, dir) for dir in os.listdir(temp_path)) + self.environments = sorted( + list(filter(lambda path: os.path.isdir(path), paths)), key=lambda x: x.lower()) + + @staticmethod + def extract_node_names(nodes_path, min_count): + logging.info("Extract node names") + return extract_node_names(read_nodes(nodes_path), min_count=min_count) + + def filter_type_edges(self, nodes_path, edges_path): + logging.info("Filter type edges") + filter_type_edges_with_chunks( + nodes_path, edges_path, kwarg_fn=self.get_writing_mode) + + def do_extraction(self): + global_nodes_with_ast = set() + + for env_path in self.environments: + logging.info(f"Found {os.path.basename(env_path)}") + + if not self.recompute_l2g: + + source_code = unpersist(join(env_path, "source_code.bz2")) + + nodes_with_ast, edges_with_ast, offsets = build_ast_only_graph( + zip(source_code["package"], source_code["id"], + source_code["filecontent"]), self.bpe_tokenizer, + create_subword_instances=self.create_subword_instances, connect_subwords=self.connect_subwords, + lang=self.lang, track_offsets=self.track_offsets + ) + + else: + nodes_with_ast = unpersist_if_present( + join(env_path, "nodes_with_ast.bz2")) + + if nodes_with_ast is None: + continue + + edges_with_ast = offsets = source_code = None + + local2global_with_ast = get_local2global( + global_nodes=global_nodes_with_ast, local_nodes=nodes_with_ast + ) + + global_nodes_with_ast.update(local2global_with_ast["global_id"]) + + self.write_type_annotation_flag(edges_with_ast, env_path) + + self.write_local( + env_path, + local2global_with_ast=local2global_with_ast, + 
nodes_with_ast=nodes_with_ast, edges_with_ast=edges_with_ast, offsets=offsets, + filecontent_with_package=source_code, + ) + + self.compact_mapping_for_l2g( + global_nodes_with_ast, "local2global_with_ast.bz2") + + def create_output_dirs(self, output_path): + if not os.path.isdir(output_path): + os.mkdir(output_path) + + with_ast_path = join(output_path, "with_ast") + + if not os.path.isdir(with_ast_path): + os.mkdir(with_ast_path) + + return with_ast_path + + def merge(self, output_directory): + + if self.extract: + logging.info("Extracting...") + self.do_extraction() + + with_ast_path = self.create_output_dirs(output_directory) + + self.merge_graph_with_ast(with_ast_path) + + def visualize_func(self, nodes, edges, output_path): + visualize(nodes, edges, output_path) + + +if __name__ == "__main__": + + args = AstDatasetCreatorArguments().parse() + + if args.recompute_l2g: + args.do_extraction = True + + logging.basicConfig(level=logging.INFO, + format="%(asctime)s:%(levelname)s:%(message)s") + + dataset = AstDatasetCreator( + args.source_code, args.language, args.bpe_tokenizer, args.create_subword_instances, + args.connect_subwords, args.only_with_annotations, args.do_extraction, args.visualize, args.track_offsets, + args.remove_type_annotations, args.recompute_l2g, args.chunksize, args.keep_frac + ) + dataset.merge(args.output_directory) diff --git a/SourceCodeTools/code/data/ast_graph/draw_graph.py b/SourceCodeTools/code/data/ast_graph/draw_graph.py new file mode 100644 index 00000000..c72369f7 --- /dev/null +++ b/SourceCodeTools/code/data/ast_graph/draw_graph.py @@ -0,0 +1,35 @@ +import pandas as pd + + + +def visualize(nodes, edges, output_path): + import pygraphviz as pgv + + edges = edges[edges["type"].apply(lambda x: not x.endswith("_rev"))] + + id2name = dict(zip(nodes['id'], nodes['serialized_name'])) + + g = pgv.AGraph(strict=False, directed=True) + + from SourceCodeTools.code.ast.python_ast2 import PythonNodeEdgeDefinitions + auxiliaty_edge_types = 
def extract_node_names(nodes, min_count):
    """Build a (node id -> human-readable name) training table.

    Parameters
    ----------
    nodes : pd.DataFrame
        Must contain at least the columns ``id``, ``type`` and
        ``serialized_name``.
    min_count : int
        Names occurring fewer than this many times are dropped.

    Returns
    -------
    pd.DataFrame with columns ``src`` (node id) and ``dst`` (name),
    or ``None`` when no name survives the frequency filter.
    """
    data = nodes.rename({"id": "src", "serialized_name": "dst"}, axis=1).copy()

    # "mention" nodes are serialized as "<name>@<scope>"; strip the scope part
    is_mention = data["type"] == "mention"
    data.loc[is_mention, "dst"] = data.loc[is_mention, "dst"].str.split("@").str[0]

    # drop auto-generated names that embed a hex address
    data = data[~data["dst"].str.contains("0x", regex=False)]

    # keep only names frequent enough to learn from; Series.map does the
    # count lookup in one vectorized pass instead of a per-row apply
    counts = data["dst"].value_counts()
    data["counts"] = data["dst"].map(counts)
    data = data[data["counts"] >= min_count]

    return data[["src", "dst"]] if len(data) > 0 else None
def filter_type_edges(nodes, edges, keep_proportion=0.0):
    """Split *edges* into plain graph edges and type-annotation labels.

    All four annotation edge types (``annotation_for`` / ``returned_by`` and
    their ``_rev`` counterparts) are removed from the graph; the forward ones
    become prediction targets.  A fraction *keep_proportion* of the forward
    annotations is put back into the graph instead of being used as labels.

    Returns
    -------
    (no_annotations, annotations): the filtered edge frame, and a
    ``src``/``dst`` label frame (``dst`` is the annotation node's serialized
    name), or ``None`` when every annotation was kept in the graph.
    """
    import pandas as pd  # local: this module does not import pandas at top level

    annotation_types = {"annotation_for", "returned_by",
                        "annotation_for_rev", "returned_by_rev"}
    is_annotation = edges["type"].isin(annotation_types)
    annotations = edges[is_annotation]
    no_annotations = edges[~is_annotation]

    # only forward edges become prediction targets; _rev edges are discarded
    annotations = annotations[annotations["type"].isin(["annotation_for", "returned_by"])]

    to_keep = int(len(annotations) * keep_proportion)
    if to_keep == 0:
        annotations_removed, annotations_kept = annotations, None
    elif to_keep == len(annotations):
        annotations_removed, annotations_kept = None, annotations
    else:
        shuffled = annotations.sample(frac=1.)
        annotations_kept, annotations_removed = shuffled.iloc[:to_keep], shuffled.iloc[to_keep:]

    if annotations_kept is not None:
        # DataFrame.append was removed in pandas 2.0; concat is the supported way
        no_annotations = pd.concat([no_annotations, annotations_kept])

    annotations = annotations_removed
    if annotations is not None:
        annotations = annotations.copy()
        node2name = dict(zip(nodes["id"], nodes["serialized_name"]))
        # BUG FIX: write the resolved name into a new `type_string` column
        # instead of clobbering `source_node_id`.  The original then renamed
        # the non-existent `type_string` column and crashed with a KeyError on
        # the ["src", "dst"] projection.  This mirrors the working logic in
        # filter_type_edges_with_chunks.
        annotations["type_string"] = annotations["source_node_id"].map(node2name)
        annotations = annotations.rename(
            {"target_node_id": "src", "type_string": "dst"}, axis=1)
        annotations = annotations[["src", "dst"]]

    return no_annotations, annotations
annotations_written = False + + for ind, edges in enumerate(read_edges(edges_path, as_chunks=True)): + annotations = edges.query( + f"type == 'annotation_for' or type == 'returned_by' or type == 'annotation_for_rev' or type == 'returned_by_rev'") + no_annotations = edges.query( + f"type != 'annotation_for' and type != 'returned_by' and type != 'annotation_for_rev' and type != 'returned_by_rev'") + + annotations = annotations.query("type == 'annotation_for' or type == 'returned_by'") + + if annotations is not None and len(annotations) > 0: + annotations["type_string"] = annotations["source_node_id"].apply(node2name.get) + # rename columns to use as a dataset + annotations.rename({"target_node_id": "src", "type_string": "dst"}, axis=1, inplace=True) + annotations = annotations[["src", "dst"]] + + kwargs = kwarg_fn(annotations_path.endswith("csv"), first_written=annotations_written) + persist(annotations, annotations_path, **kwargs) + + annotations_written = True + + kwargs = kwarg_fn(temp_edges.endswith("csv"), first_written=ind != 0) + persist(no_annotations, temp_edges, **kwargs) + + os.remove(edges_path) + os.rename(temp_edges, edges_path) \ No newline at end of file diff --git a/SourceCodeTools/code/data/ast_graph/local2global.py b/SourceCodeTools/code/data/ast_graph/local2global.py new file mode 100644 index 00000000..41c66989 --- /dev/null +++ b/SourceCodeTools/code/data/ast_graph/local2global.py @@ -0,0 +1,45 @@ +import sys + +from SourceCodeTools.code.common import compute_long_id, create_node_repr +from SourceCodeTools.code.data.file_utils import * + + +def create_local_to_global_id_map(local_nodes, global_nodes): + # local_nodes = local_nodes.copy() + # global_nodes = global_nodes.copy() + # + # global_nodes['node_repr'] = create_node_repr(global_nodes) + # local_nodes['node_repr'] = create_node_repr(local_nodes) + # + # rev_id_map = dict(zip( + # global_nodes['node_repr'].tolist(), global_nodes['id'].tolist() + # )) + # id_map = dict(zip( + # 
local_nodes["id"].tolist(), map( + # lambda x: rev_id_map[x], local_nodes["node_repr"].tolist() + # ) + # )) + id_map = dict(zip( + local_nodes["id"], map(compute_long_id, create_node_repr(local_nodes)) + )) + + return id_map + + +def get_local2global(global_nodes, local_nodes) -> pd.DataFrame: + local_nodes = local_nodes.copy() + id_map = create_local_to_global_id_map(local_nodes=local_nodes, global_nodes=global_nodes) + + local_nodes['global_id'] = local_nodes['id'].apply(lambda x: id_map.get(x, None)) + + return local_nodes[['id', 'global_id']] + + +if __name__ == "__main__": + global_nodes = unpersist_or_exit(sys.argv[1], "Global nodes do not exist!") + local_nodes = unpersist_or_exit(sys.argv[2], "No processed nodes, skipping") + local_map_path = sys.argv[3] + + local2global = get_local2global(global_nodes, local_nodes) + + persist(local2global, local_map_path) diff --git a/SourceCodeTools/code/data/cubert_python_benchmarks/SQLTable.py b/SourceCodeTools/code/data/cubert_python_benchmarks/SQLTable.py new file mode 100644 index 00000000..586c63ab --- /dev/null +++ b/SourceCodeTools/code/data/cubert_python_benchmarks/SQLTable.py @@ -0,0 +1,50 @@ +import sqlite3 + +import pandas as pd + + +class SQLTable: + def __init__(self, filename): + self.conn = sqlite3.connect(filename) + self.path = filename + + def replace_records(self, table, table_name, **kwargs): + table.to_sql(table_name, con=self.conn, if_exists='replace', index=False, method="multi", chunksize=1000, **kwargs) + self.create_index_for_table(table, table_name) + + def add_records(self, table, table_name, **kwargs): + table.to_sql(table_name, con=self.conn, if_exists='append', index=False, method="multi", chunksize=1000, **kwargs) + self.create_index_for_table(table, table_name) + + def create_index_for_table(self, table, table_name): + self.execute( + f""" + CREATE INDEX IF NOT EXISTS idx_{table_name} + ON {table_name}({','.join(repr(col) for col in table.columns)}) + """ + ) + + def 
def write_chunk(temp, column_order, first_part, output_path):
    """Append the records in *temp* to *output_path* as CSV.

    The ``function`` column is renamed to ``filecontent``.  The header row is
    only written for the first chunk; later chunks are appended headerless.
    """
    frame = pd.DataFrame.from_records(temp, columns=column_order).rename(
        columns={"function": "filecontent"}
    )
    if first_part is True:
        # first chunk creates (or overwrites) the file and writes the header
        frame.to_csv(output_path, index=False)
    else:
        frame.to_csv(output_path, index=False, mode="a", header=False)
record.pop("parsing_error"): + continue + + if column_order is None: + column_order = list(record.keys()) + column_order.insert(0, "id") + + record["id"] = id_ + id_ += 1 + + temp.append(record) + + if len(temp) > 10000: + write_chunk(temp, column_order, first_part, output_path) + first_part = False + temp.clear() + + if len(temp) > 0: + write_chunk(temp, column_order, first_part, output_path) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser("Convert CuBERT dataset file from json format to pandas table comparible with AST graph builder") + + parser.add_argument("dataset_path", help="Path to json file (compressed bz2)") + parser.add_argument("output_path") + + args = parser.parse_args() + + dataset_path = Path(args.dataset_path) + output_path = Path(args.output_path) + + if dataset_path.name.endswith("bz2"): + convert_bzip(dataset_path, output_path) + else: + convert_json(dataset_path, output_path) \ No newline at end of file diff --git a/SourceCodeTools/code/data/cubert_python_benchmarks/download.sh b/SourceCodeTools/code/data/cubert_python_benchmarks/download.sh new file mode 100644 index 00000000..65e3f144 --- /dev/null +++ b/SourceCodeTools/code/data/cubert_python_benchmarks/download.sh @@ -0,0 +1,106 @@ +mkdir function_docstring_datasets +gsutil -m cp \ + "gs://cubert/20200621_Python/function_docstring_datasets/dev.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/dev.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/dev.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/dev.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/eval.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/eval.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/eval.jsontxt-00002-of-00004" \ + 
"gs://cubert/20200621_Python/function_docstring_datasets/eval.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/train.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/train.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/train.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/function_docstring_datasets/train.jsontxt-00003-of-00004" \ + function_docstring_datasets + +mkdir exception_datasets +gsutil -m cp \ + "gs://cubert/20200621_Python/exception_datasets/dev.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/dev.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/dev.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/dev.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/eval.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/eval.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/eval.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/eval.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/train.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/train.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/train.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/exception_datasets/train.jsontxt-00003-of-00004" \ + exception_datasets + +mkdir variable_misuse_datasets +gsutil -m cp \ + "gs://cubert/20200621_Python/variable_misuse_datasets/dev.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/dev.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/dev.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/dev.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/eval.jsontxt-00000-of-00004" \ 
+ "gs://cubert/20200621_Python/variable_misuse_datasets/eval.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/eval.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/eval.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/train.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/train.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/train.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_datasets/train.jsontxt-00003-of-00004" \ + variable_misuse_datasets + +mkdir swapped_operands_datasets +gsutil -m cp \ + "gs://cubert/20200621_Python/swapped_operands_datasets/dev.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/dev.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/dev.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/dev.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/eval.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/eval.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/eval.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/eval.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/train.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/train.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/train.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/swapped_operands_datasets/train.jsontxt-00003-of-00004" \ + swapped_operands_datasets + +mkdir wrong_binary_operator_datasets +gsutil -m cp \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/dev.jsontxt-00000-of-00004" \ + 
"gs://cubert/20200621_Python/wrong_binary_operator_datasets/dev.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/dev.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/dev.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/eval.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/eval.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/eval.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/eval.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/train.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/train.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/train.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/wrong_binary_operator_datasets/train.jsontxt-00003-of-00004" \ + wrong_binary_operator_datasets + +mkdir variable_misuse_repair_datasets +gsutil -m cp \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/dev.jsontxt-00000-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/dev.jsontxt-00001-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/dev.jsontxt-00002-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/dev.jsontxt-00003-of-00004" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/eval.jsontxt-00000-of-00006" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/eval.jsontxt-00001-of-00006" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/eval.jsontxt-00002-of-00006" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/eval.jsontxt-00003-of-00006" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/eval.jsontxt-00004-of-00006" \ + 
"gs://cubert/20200621_Python/variable_misuse_repair_datasets/eval.jsontxt-00005-of-00006" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/githubcommits.jsontxt-00000-of-00001" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/githubcommits.raw.jsontxt-00000-of-00001" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00000-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00001-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00002-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00003-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00004-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00005-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00006-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00007-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00008-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00009-of-00011" \ + "gs://cubert/20200621_Python/variable_misuse_repair_datasets/train.jsontxt-00010-of-00011" \ + variable_misuse_repair_datasets \ No newline at end of file diff --git a/SourceCodeTools/code/data/cubert_python_benchmarks/extract_partition.py b/SourceCodeTools/code/data/cubert_python_benchmarks/extract_partition.py new file mode 100644 index 00000000..e4ae5aa0 --- /dev/null +++ b/SourceCodeTools/code/data/cubert_python_benchmarks/extract_partition.py @@ -0,0 +1,28 @@ +import os.path + +from SourceCodeTools.code.data.file_utils import unpersist, persist + + +def extract_partitions(path): + filecontent = unpersist(path) + + items = filecontent[["id"]] + masks = filecontent["partition"].values + + # create masks + items["train_mask"] = masks 
== "train" + items["val_mask"] = masks == "dev" + items["test_mask"] = masks == "eval" + + dirname = os.path.dirname(path) + persist(items, os.path.join(dirname, "partition.json")) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("filecontent") + + args = parser.parse_args() + + extract_partitions(args.filecontent) \ No newline at end of file diff --git a/SourceCodeTools/code/data/cubert_python_benchmarks/partitioning.py b/SourceCodeTools/code/data/cubert_python_benchmarks/partitioning.py new file mode 100644 index 00000000..39fd1eab --- /dev/null +++ b/SourceCodeTools/code/data/cubert_python_benchmarks/partitioning.py @@ -0,0 +1,87 @@ +import json +from functools import partial +from os.path import join +from random import random + +from SourceCodeTools.code.common import read_edges, read_nodes +from SourceCodeTools.code.data.file_utils import persist + + +def add_splits(items, train_frac, restricted_id_pool=None): + items = items.copy() + + def random_partition(): + r = random() + if r < train_frac: + return "train" + elif r < train_frac + (1 - train_frac) / 2: + return "val" + else: + return "test" + + import numpy as np + # define partitioning + masks = np.array([random_partition() for _ in range(len(items))]) + + # create masks + items["train_mask"] = masks == "train" + items["val_mask"] = masks == "val" + items["test_mask"] = masks == "test" + + if restricted_id_pool is not None: + # if `restricted_id_pool` is provided, mask all nodes not in `restricted_id_pool` negatively + to_keep = items.eval("id in @restricted_ids", local_dict={"restricted_ids": restricted_id_pool}) + items["train_mask"] = items["train_mask"] & to_keep + items["test_mask"] = items["test_mask"] & to_keep + items["val_mask"] = items["val_mask"] & to_keep + + return items + + +def subgraph_partitioning(path_to_dataset, partition_column, train_frac=0.8): + + get_path = partial(join, path_to_dataset) + + # nodes = 
read_nodes(get_path("common_nodes.json.bz2")) + edges = read_edges(get_path("common_edges.json.bz2")) + + subgraph_ids = add_splits( + edges[[partition_column]].dropna(axis=0).drop_duplicates(), train_frac=train_frac + ).rename({partition_column: "id"}, axis=1) + + valid_subgraph_ids = set(subgraph_ids["id"]) + + persist(subgraph_ids, get_path("subgraph_partition.json")) + + edges = edges[["source_node_id", "target_node_id", partition_column]].sort_values(partition_column) + + last_subgraph_id = -1 + pool = set() + + with open(get_path("subgraph_mapping.json"), "w") as subgraph_mapping: + for src, dst, subgraph_id in edges[["source_node_id", "target_node_id", partition_column]].values: + if subgraph_id in valid_subgraph_ids: + if subgraph_id != last_subgraph_id and last_subgraph_id != -1: + record = json.dumps({ + "subgraph_id": last_subgraph_id, + "node_ids": list(pool) + }) + subgraph_mapping.write(f"{record}\n") + + pool.clear() + + pool.add(src) + pool.add(dst) + last_subgraph_id = subgraph_id + + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("dataset_path") + parser.add_argument("subgraph_column") + + args = parser.parse_args() + + subgraph_partitioning(args.dataset_path, args.subgraph_column) \ No newline at end of file diff --git a/SourceCodeTools/code/data/cubert_python_benchmarks/prepare_for_ast_parser.py b/SourceCodeTools/code/data/cubert_python_benchmarks/prepare_for_ast_parser.py new file mode 100644 index 00000000..e5e6b504 --- /dev/null +++ b/SourceCodeTools/code/data/cubert_python_benchmarks/prepare_for_ast_parser.py @@ -0,0 +1,397 @@ +import hashlib + +# import pandas as pd +import ast +import json +from pathlib import Path +from nltk import RegexpTokenizer +from itertools import chain + +from tqdm import tqdm + +from SourceCodeTools.code.data.cubert_python_benchmarks.SQLTable import SQLTable + + +class CodeTokenizer: + tok = RegexpTokenizer("\w+|\s+|\W+") + + @classmethod + def 
tokenize(cls, code): + return cls.tok.tokenize(code) + + @classmethod + def detokenize(cls, code_tokens): + return "".join(code_tokens) + + +def test_CodeTokenizer(): + code = """ +data = pd.DataFrame2.from_records(json.loads(line) for line in open(filename).readlines()) + +success = 0 +errors = 0 +""" + assert CodeTokenizer.detokenize(CodeTokenizer.tokenize(code)) == code + + +test_CodeTokenizer() + + +class Info: + """ + Example + dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async`->`self` + """ + def __init__(self, info): + self.info = info + parts = info.split(" ") + self.parse_filepath(parts) + self.parse_fn_path(parts) + self.parse_state(parts) + self.parse_package(parts) + self.set_id() + + def parse_filepath(self, parts): + self.filepath = " ".join(self.info.split(".py")[0].split(" ")[1:]) + ".py" + # self.filepath = parts[1].split(".py")[0] + ".py" + + def parse_fn_path(self, parts): + self.fn_path = self.filepath + "/" + self.info.split(".py")[1].lstrip("/").lstrip(" ").split("/")[0] + + def parse_state(self, parts): + self.state = "/".join(self.info.split(".py ")[1].lstrip("/").lstrip(" ").split("/")[1:]) + + def parse_package(self, parts): + self.package = self.filepath.split("/")[0] + + def set_id(self): + identifier = "\t".join([self.fn_path, self.state]) + self.id = hashlib.md5(identifier.encode('utf-8')).hexdigest() + + oidentifier = "\t".join([self.fn_path, "original"]) + + if self.state.endswith("riginal"): + assert identifier == oidentifier + self.original_id = hashlib.md5(oidentifier.encode('utf-8')).hexdigest() + + def __repr__(self): + return self.info + + +class DatasetAdapter: + replacements = { + "async": "async_", + "await": "await_" + } + + fields = { + "variable_misuse": ["function", "label", "info"], + "variable_misuse_repair": ["function", "target_mask", "error_location_mask", "candidate_mask", "provenance"], + "exception": ["function", "label", "info"], 
+ "function_docstring": ["function", "docstring", "label", "info"], + "swapped_operands": ["function", "label", "info"], + "wrong_binary_operator": ["function", "label", "info"] + } + supported_partitions = ["train", "dev", "eval"] # + + benchmark_names = { + "exception": "exception_datasets", + "function_docstring": "function_docstring_datasets", + "swapped_operands": "swapped_operands_datasets", + "variable_misuse": "variable_misuse_dataset", + "variable_misuse_repair": "variable_misuse_repair_datasets", + "wrong_binary_operator": "wrong_binary_operator_datasets" + } + + preferred_column_order = ["id", "package", "function", "info", "label", "partition"] + + import_order = [ + "variable_misuse", + # "variable_misuse_repair", + # "exception", + # "function_docstring", + # "swapped_operands", + # "wrong_binary_operator" + ] + + # expected_directory_structure = "dev.jsontxt-00000-of-00004\n" + # "dev.jsontxt-00001-of-00004\n" + # "dev.jsontxt-00002-of-00004\n" + # "dev.jsontxt-00003-of-00004\n" + # "eval.jsontxt-00000-of-00004\n" + # "eval.jsontxt-00001-of-00004\n" + # "eval.jsontxt-00002-of-00004\n" + # "eval.jsontxt-00003-of-00004\n" + # "train.jsontxt-00000-of-00004\n" + # "train.jsontxt-00001-of-00004\n" + # "train.jsontxt-00002-of-00004\n" + # "train.jsontxt-00003-of-00004\n" + + def __init__(self, dataset_location): + self.dataset_location = Path(dataset_location) + + self.replacement_fns = { + "function": self.fix_code_if_needed, + "info": self.fix_info_if_needed + } + + self.preprocess = { + # "variable_misuse_repair": { + # "function": self.cubert_detokenize + # } + } + + self.extra_fields = { + "info": [("package", self.get_package)], + "provenance": [("info", self.fix_info_if_needed)] + } + + # self.db = SQLTable(self.dataset_location.joinpath("cubert_benchmarks.db")) + + # def load_original_functions(self): + # functions = self.db.query("SELECT DISTINCT original_id, function FROM functions where comment = 'original' AND dataset = 'variable_misuse'") + # 
self.original_functions = dict(zip(functions["original_id"], functions["function"])) + + def prepare_misuse_repair_record(self, record): + print(record) + + @staticmethod + def get_source_from_ast_range(node, function, strip=True): + lines = function.split("\n") + start_line = node.lineno + end_line = node.end_lineno + start_col = node.col_offset + end_col = node.end_col_offset + + source = "" + num_lines = end_line - start_line + 1 + if start_line == end_line: + section = lines[start_line - 1].encode("utf8")[start_col:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" + else: + for ind, lineno in enumerate(range(start_line - 1, end_line)): + if ind == 0: + section = lines[lineno].encode("utf8")[start_col:].decode( + "utf8") + source += section.strip() if strip else section + "\n" + elif ind == num_lines - 1: + section = lines[lineno].encode("utf8")[:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" + else: + section = lines[lineno] + source += section.strip() if strip else section + "\n" + + return source.rstrip() + + def get_dispatch(self, function): + root = ast.parse(function) + return self.get_source_from_ast_range(root.body[0].decorator_list[0], function) + + @staticmethod + def remove_indent(code): + lines = code.strip("\n").split("\n") + first_line_indent = lines[0][:len(lines[0]) - len(lines[0].lstrip())] + start_char = len(first_line_indent) + if start_char != 0: + clean = "\n".join(line[start_char:] if line.startswith(first_line_indent) else line for line in lines) + else: + if lines[0].lstrip().startswith("@"): + for ind, line in enumerate(lines): + stripped = line.lstrip() + if stripped.startswith("def "): + lines[ind] = stripped + break + clean = "\n".join(lines) + return clean + + @classmethod + def fix_code_if_needed(cls, code): + # f = code.lstrip() + f = cls.remove_indent(code) + try: + ast.parse(f) + except Exception as e: + tokens = CodeTokenizer.tokenize(f) + recovered_tokens = 
[cls.replacements[token] if token in cls.replacements else token for token in tokens] + f = CodeTokenizer.detokenize(recovered_tokens) + ast.parse(f) + return f + + @classmethod + def fix_info_if_needed(cls, info): + if not info.endswith("original"): + parts = info.split(" ") + variable_replacement = parts[-1] + + variable_replacement = CodeTokenizer.detokenize( + cls.replacements[token] if token in cls.replacements else token + for token in CodeTokenizer.tokenize(variable_replacement) + ) + + parts[-1] = variable_replacement + + info = " ".join(parts) + return info + + @staticmethod + def get_package(info): + return info.split(" ")[1].split("/")[0] + + @classmethod + def sort_columns(cls, columns): + return sorted(columns, key=cls.preferred_column_order.index) + + def process_record(self, record, preprocess_fns): + new_record = {} + for field, data in record.items(): + if preprocess_fns is not None and field in preprocess_fns: + record[field] = preprocess_fns[field](data) + + new_record[field] = data if field not in self.replacement_fns else self.replacement_fns[field](data) + + for new_field, new_field_fn in self.extra_fields.get(field, []): + new_record[new_field] = new_field_fn(data) + + return new_record + + @classmethod + def stream_original_partition(cls, directory, partition, *args, **kwargs): + directory = Path(directory) + assert partition in cls.supported_partitions, f"Only the partitions should be one of: {cls.supported_partitions}, but {partition} given" + for file in directory.iterdir(): + if file.name.startswith(partition): + with open(file) as p: + for line in p: + yield json.loads(line) + + def stream_processed_partition(self, directory, partition, add_partition=False, preprocess_fns=None): + for record in self.stream_original_partition(directory, partition): + try: + r = self.process_record(record, preprocess_fns=preprocess_fns) + r["parsing_error"] = None + except Exception as e: + r = record + r["parsing_error"] = e.msg if hasattr(e, "msg") else 
e.__class__.__name__ + r["package"] = self.get_package(r["info"]) + # except MemoryError: # there are two functions that cause this error + # continue + # except SyntaxError as e: + # continue + if add_partition: + r["partition"] = partition + yield r + + # @classmethod + # def process_dataset(cls, original_data_location, output_location): + # partitions = ["train", "dev", "eval"] + # + # last_id = 0 + # column_order = None + # + # data = pd.DataFrame.from_records( + # chain( + # *(cls.stream_processed_partition(original_data_location, partition, add_partition=True) for partition in + # partitions)), + # # columns = column_order + # ) + # data["id"] = range(len(data)) + # # data.to_pickle(output_location, index=False, columns=cls.sort_columns(data.columns)) + # data.to_pickle(output_location) + + def iterate_dataset(self, dataset_name): + for partition in self.supported_partitions: + for record in self.stream_processed_partition( + self.dataset_location.joinpath(self.benchmark_names[dataset_name]), partition, + preprocess_fns=self.preprocess.get(dataset_name, None) + ): + record["partition"] = partition + yield record + + def import_data(self): + + functions = [] + + # added_original = set() + + for dataset_name in self.import_order: + + parsed_successfully = 0 + parsed_with_errors = 0 + + dataset_file = open(self.dataset_location.joinpath(f"{dataset_name}.json"), "w") + + for record in tqdm(self.iterate_dataset(dataset_name), desc=f"Processing {dataset_name}"): + info = Info(record["info"]) + + # if info.id != "12101e677d30de9462596e7894d2bbd1": + # continue + + # if dataset_name == "variable_misuse_repair": + # record = self.prepare_misuse_repair_record(record) + # if record["function"].startswith("@dispatch"): + # dispatch = self.get_dispatch(record["function"]) + # info.fn_path += f"@{dispatch}" + # info.set_id() + + record_for_writing = { + # "id": info.id, + # "original_id": info.original_id, + # "filepath": info.filepath, + "fn_path": info.fn_path, + 
"function": record["function"], + "comment": info.state, + "package": info.package, + "label": record["label"], + # "dataset": dataset_name, + "partition": record["partition"], + "parsing_error": record["parsing_error"] + } + + if dataset_name == "variable_misuse": + assert record_for_writing["comment"].startswith("original") or record_for_writing["comment"].startswith("VarMisuse") + + if record["parsing_error"] is None: + parsed_successfully += 1 + dataset_file.write(f"{json.dumps(record_for_writing)}\n") + else: + parsed_with_errors += 1 + + dataset_file.close() + print(f"{dataset_name}: success {parsed_successfully} error {parsed_with_errors}") + # functions.append(record_for_writing) + # + # if len(functions) > 100000: + # self.db.add_records(pd.DataFrame.from_records(functions), "functions") + # functions.clear() + # + # if len(functions) > 0: + # self.db.add_records(pd.DataFrame.from_records(functions), "functions") + # functions.clear() + + + +def test_fix_info_if_needed(): + example_info = "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async`->`self`" + assert DatasetAdapter.fix_info_if_needed( + example_info) == "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async_`->`self`" + + example_info = "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async_call`->`self`" + assert DatasetAdapter.fix_info_if_needed( + example_info) == "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async_call`->`self`" + + +test_fix_info_if_needed() + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser("Convert CuBERT's variable misuse detection dataset for further processing") + parser.add_argument("dataset_path", help="Path to dataset 
folder") + # parser.add_argument("output_path", help="Path to output file") + args = parser.parse_args() + + dataset = DatasetAdapter(args.dataset_path) + dataset.import_data() + # DatasetAdapter.process_dataset(args.dataset_path, args.output_path) \ No newline at end of file diff --git a/SourceCodeTools/code/data/cubert_python_benchmarks/variable_misuse_node_level_labels.py b/SourceCodeTools/code/data/cubert_python_benchmarks/variable_misuse_node_level_labels.py new file mode 100644 index 00000000..8d567ca9 --- /dev/null +++ b/SourceCodeTools/code/data/cubert_python_benchmarks/variable_misuse_node_level_labels.py @@ -0,0 +1,71 @@ +from os.path import join + +import pandas as pd +from tqdm import tqdm + +from SourceCodeTools.code.data.file_utils import unpersist, persist + + +def get_node_labels(dataset_path): + filecontent = unpersist(join(dataset_path, "common_filecontent.json")) + nodes = unpersist(join(dataset_path, "common_nodes.json")) + edges = unpersist(join(dataset_path, "common_edges.json")) + + id2comment = dict(zip(filecontent["id"], filecontent["comment"])) + nodeid2name = dict(zip(nodes["id"], nodes["serialized_name"])) + nodeid2type = dict(zip(nodes["id"], nodes["type"])) + + edges.sort_values("file_id", inplace=True) + + del nodes, filecontent + + misuse_labels = [] + + for file_id, file_edges in tqdm(edges.groupby("file_id")): + comment = id2comment[file_id] + + if comment is None: + continue + + if comment.startswith("original"): + continue + + misused_vars = set(comment.split(" ")[-1].strip("`").split("`->`")) + + if len(misused_vars) != 2: + print(f"Error in {file_id}") + continue + + all_nodes = set(file_edges["source_node_id"].append(file_edges["target_node_id"])) + + for node_id in all_nodes: + if nodeid2type[node_id] == "mention": + node_name = nodeid2name[node_id].split("@")[0] + if node_name in misused_vars: + misuse_labels.append({ + "src": node_id, + "dst": "misused" + }) + else: + misuse_labels.append({ + "src": node_id, + "dst": 
"correct" + }) + + labels = pd.DataFrame.from_records(misuse_labels) + + persist(labels, join(dataset_path, "misuse_labels.json")) + + + + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("dataset_path") + + args = parser.parse_args() + + get_node_labels(args.dataset_path) \ No newline at end of file diff --git a/SourceCodeTools/code/data/dataset/Dataset.py b/SourceCodeTools/code/data/dataset/Dataset.py new file mode 100644 index 00000000..e43a0cbd --- /dev/null +++ b/SourceCodeTools/code/data/dataset/Dataset.py @@ -0,0 +1,1149 @@ +# from collections import Counter +# from itertools import chain +from collections import Counter +from typing import List, Optional + +import pandas +import numpy +import pickle + +from os.path import join + +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker, NodeNameMasker, NodeClfMasker +from SourceCodeTools.code.data.dataset.reader import load_data +from SourceCodeTools.code.data.file_utils import * +from SourceCodeTools.code.ast.python_ast import PythonSharedNodes +from SourceCodeTools.nlp.embed.bpe import make_tokenizer, load_bpe_model +from SourceCodeTools.tabular.common import compact_property +from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types +from SourceCodeTools.code.data.sourcetrail.sourcetrail_extract_node_names import extract_node_names + + +def filter_dst_by_freq(elements, freq=1): + counter = Counter(elements["dst"]) + allowed = {item for item, count in counter.items() if count >= freq} + target = elements.query("dst in @allowed", local_dict={"allowed": allowed}) + return target + + +class SourceGraphDataset: + g = None + nodes = None + edges = None + node_types = None + edge_types = None + + train_frac = None + random_seed = None + labels_from = None + use_node_types = None + use_edge_types = None + filter_edges = None + self_loops = None + + def __init__( + self, data_path: Union[str, Path], use_node_types: 
bool = False, use_edge_types: bool = False, + filter_edges: Optional[List[str]] = None, self_loops: bool = False, + train_frac: float = 0.6, random_seed: Optional[int] = None, tokenizer_path: Union[str, Path] = None, + min_count_for_objectives: int = 1, + no_global_edges: bool = False, remove_reverse: bool = False, custom_reverse: Optional[List[str]] = None, + # package_names: Optional[List[str]] = None, + restricted_id_pool: Optional[List[int]] = None, use_ns_groups: bool = False, + subgraph_id_column=None, subgraph_partition=None + ): + """ + Prepares the data for training GNN model. The graph is prepared in the following way: + 1. Edges are split into the train set and holdout set. Holdout set is used in the future experiments. + Without holdout set the results of the future experiments may be biased. After removing holdout edges + from the main graph, the disconnected nodes are filtered, so that he graph remain connected. + 2. Since training objective will be defined on the node embeddings, the nodes are split into train, test, + and validation sets. The test set should be used in the future experiments for training. Validation and + test sets are equal in size and constitute 40% of all nodes. + 3. The default label is assumed to be node type. Node types can be incorporated into the model by setting + node_types flag to True. + 4. Graphs require contiguous indexing of nodes. For this reason additional mapping is created that tracks + the relationship between the new graph id and the original node id from the training data. 
+ :param data_path: path to the directory with dataset files stored in `bz2` format + :param use_node_types: whether to use node types in the graph + :param use_edge_types: whether to use edge types in the graph + :param filter_edges: edge types to be removed from the graph + :param self_loops: whether to include self-loops + :param train_frac: fraction of the nodes that will be used for training + :param random_seed: seed for generating random splits + :param tokenizer_path: path to bpe tokenizer, needed to process op names correctly + :param min_count_for_objectives: minimum degree of nodes, after which they are excluded from training data + :param no_global_edges: whether to remove global edges from the dataset. + :param remove_reverse: whether to remove reverse edges from the dataset + :param custom_reverse: list of edges for which reverse types should be added. Used together with `remove_reverse` + :param package_names: list of packages that should be used for partitioning into train and test sets. 
Used to + draw a solid distinction between code in train and test sets + :param restricted_id_pool: path to csv file with column `node_id` that stores nodes that should be involved into + training and testing + :param use_ns_groups: currently not used + + """ + self.random_seed = random_seed + self.nodes_have_types = use_node_types + self.edges_have_types = use_edge_types + self.data_path = data_path + self.tokenizer_path = tokenizer_path + self.min_count_for_objectives = min_count_for_objectives + self.no_global_edges = no_global_edges + self.remove_reverse = remove_reverse + self.custom_reverse = custom_reverse + self.subgraph_id_column = subgraph_id_column + self.subgraph_partition = subgraph_partition + + self.use_ns_groups = use_ns_groups + + nodes_path = join(data_path, "common_nodes.json.bz2") + edges_path = join(data_path, "common_edges.json.bz2") + + self.nodes, self.edges = load_data(nodes_path, edges_path) + + # self.nodes, self.edges, self.holdout = self.holdout(self.nodes, self.edges) + + # index is later used for sampling and is assumed to be unique + assert len(self.nodes) == len(self.nodes.index.unique()) + assert len(self.edges) == len(self.edges.index.unique()) + + if self_loops: + self.nodes, self.edges = SourceGraphDataset._assess_need_for_self_loops( + self.nodes, self.edges) + + if filter_edges is not None: + for e_type in filter_edges: + logging.info(f"Filtering edge type {e_type}") + self.edges = self.edges.query(f"type != '{e_type}'") + + if self.remove_reverse: + self._remove_reverse_edges() + + if self.no_global_edges: + self._remove_global_edges() + + if self.custom_reverse is not None: + self._add_custom_reverse() + + if use_node_types is False and use_edge_types is False: + new_nodes, new_edges = self._create_nodetype_edges( + self.nodes, self.edges) + self.nodes = self.nodes.append(new_nodes, ignore_index=True) + self.edges = self.edges.append(new_edges, ignore_index=True) + + self.nodes['type_backup'] = self.nodes['type'] + if not 
self.nodes_have_types: + self.nodes['type'] = "node_" + self.nodes = self.nodes.astype({'type': 'category'}) + + self._add_embedding_names() + # self._add_embeddable_flag() + + # need to do this to avoid issues insode dgl library + self.edges['type'] = self.edges['type'].apply(lambda x: f"{x}_") + self.edges['type_backup'] = self.edges['type'] + if not self.edges_have_types: + self.edges['type'] = "edge_" + self.edges = self.edges.astype({'type': 'category'}) + + # compact labels + # self.nodes['label'] = self.nodes[label_from] + # self.nodes = self.nodes.astype({'label': 'category'}) + # self.label_map = compact_property(self.nodes['label']) + # assert any(pandas.isna(self.nodes['label'])) is False + + logging.info( + f"Unique nodes: {len(self.nodes)}, node types: {len(self.nodes['type'].unique())}") + logging.info( + f"Unique edges: {len(self.edges)}, edge types: {len(self.edges['type'].unique())}") + + # self.nodes, self.label_map = self.add_compact_labels() + self._add_typed_ids() + + self._add_splits(train_frac=train_frac, + package_names=None, # package_names, + restricted_id_pool=restricted_id_pool) + + # self.mark_leaf_nodes() + + self._create_hetero_graph() + + self._update_global_id() + + self.nodes.sort_values('global_graph_id', inplace=True) + + def _add_embedding_names(self): + self.nodes["embeddable"] = True + self.nodes["embeddable_name"] = self.nodes["name"].apply( + self.get_embeddable_name) + + def _add_embeddable_flag(self): + embeddable_types = PythonSharedNodes.shared_node_types | set( + list(node_types.values())) + + if len(self.nodes.query("type_backup == 'subword'")) > 0: + # some of the types should not be embedded if subwords were generated + embeddable_types = embeddable_types - PythonSharedNodes.tokenizable_types + + embeddable_types |= {"node_type"} + + # self.nodes['embeddable'] = False + self.nodes.eval( + "embeddable = type_backup in @embeddable_types", + local_dict={"embeddable_types": embeddable_types}, + inplace=True + ) + + 
self.nodes["embeddable_name"] = self.nodes["name"].apply( + self.get_embeddable_name) + + def _op_tokens(self): + if self.tokenizer_path is None: + from SourceCodeTools.code.ast.python_tokens_to_bpe_subwords import python_ops_to_bpe + logging.info("Using heuristic tokenization for ops") + + # def op_tokenize(op_name): + # return python_ops_to_bpe[op_name] if op_name in python_ops_to_bpe else None + return python_ops_to_bpe + else: + # from SourceCodeTools.code.python_tokens_to_bpe_subwords import op_tokenize_or_none + + tokenizer = make_tokenizer(load_bpe_model(self.tokenizer_path)) + + # def op_tokenize(op_name): + # return op_tokenize_or_none(op_name, tokenizer) + + from SourceCodeTools.code.ast.python_tokens_to_bpe_subwords import python_ops_to_literal + return { + op_name: tokenizer(op_literal) + for op_name, op_literal in python_ops_to_literal.items() + } + + # self.nodes.eval("name_alter_tokens = name.map(@op_tokenize)", + # local_dict={"op_tokenize": op_tokenize}, inplace=True) + + def _add_splits(self, train_frac, package_names=None, restricted_id_pool=None): + """ + Generates train, validation, and test masks + Store the masks is pandas table for nodes + :param train_frac: + :return: + """ + + self.nodes.reset_index(drop=True, inplace=True) + assert len(self.nodes.index) == self.nodes.index.max() + 1 + # generate splits for all nodes, additional filtering will be applied later + # by an objective + + if package_names is None: + splits = self.get_train_val_test_indices( + self.nodes.index, + train_frac=train_frac, random_seed=self.random_seed + ) + else: + splits = self.get_train_val_test_indices_on_packages( + self.nodes, package_names, + train_frac=train_frac, random_seed=self.random_seed + ) + + self.create_train_val_test_masks(self.nodes, *splits) + + if restricted_id_pool is not None: + node_ids = set(pd.read_csv(restricted_id_pool)["node_id"].tolist()) | \ + set(self.nodes.query( + "type_backup == 'FunctionDef' or type_backup == 
'mention'")["id"].tolist()) + to_keep = self.nodes["id"].apply(lambda id_: id_ in node_ids) + self.nodes["train_mask"] = self.nodes["train_mask"] & to_keep + self.nodes["test_mask"] = self.nodes["test_mask"] & to_keep + self.nodes["val_mask"] = self.nodes["val_mask"] & to_keep + + def _add_typed_ids(self): + nodes = self.nodes.copy() + + typed_id_map = {} + + # node_types = dict(zip(self.node_types['int_type'], self.node_types['str_type'])) + + for type_ in nodes['type'].unique(): + type_mask = nodes['type'] == type_ + id_map = compact_property(nodes.loc[type_mask, 'id']) + nodes.loc[type_mask, 'typed_id'] = nodes.loc[type_mask, + 'id'].apply(lambda old_id: id_map[old_id]) + typed_id_map[type_] = id_map + + assert any(pandas.isna(nodes['typed_id'])) is False + + nodes = nodes.astype({"typed_id": "int"}) + + self.nodes, self.typed_id_map = nodes, typed_id_map + # return nodes, typed_id_map + + # def add_compact_labels(self): + # nodes = self.nodes.copy() + # label_map = compact_property(nodes['label']) + # nodes['compact_label'] = nodes['label'].apply(lambda old_id: label_map[old_id]) + # return nodes, label_map + + @staticmethod + def _add_node_types_to_edges(nodes, edges): + + # nodes = self.nodes + # edges = self.edges.copy() + + node_type_map = dict(zip(nodes['id'].values, nodes['type'])) + + edges['src_type'] = edges['src'].apply( + lambda src_id: node_type_map[src_id]) + edges['dst_type'] = edges['dst'].apply( + lambda dst_id: node_type_map[dst_id]) + edges = edges.astype({'src_type': 'category', 'dst_type': 'category'}) + + return edges + + @staticmethod + def _create_nodetype_edges(nodes, edges): + node_new_id = nodes["id"].max() + 1 + edge_new_id = edges["id"].max() + 1 + + new_nodes = [] + new_edges = [] + added_type_nodes = {} + + node_slice = nodes[["id", "type"]].values + + for id, type in node_slice: + if type not in added_type_nodes: + added_type_nodes[type] = node_new_id + node_new_id += 1 + + new_nodes.append({ + "id": added_type_nodes[type], + 
"name": f"##node_type_{type}", + "type": "node_type", + "mentioned_in": pd.NA + }) + + new_edges.append({ + "id": edge_new_id, + "type": "node_type", + "src": added_type_nodes[type], + "dst": id, + "file_id": pd.NA, + "mentioned_in": pd.NA + }) + edge_new_id += 1 + + return pd.DataFrame(new_nodes), pd.DataFrame(new_edges) + + def _remove_ast_edges(self): + global_edges = self.get_global_edges() + global_edges.add("subword") + def is_global(type): return type in global_edges + edges = self.edges.query( + "type_backup.map(@is_global)", local_dict={"is_global": is_global}) + self.nodes, self.edges = self.ensure_connectedness(self.nodes, edges) + + def _remove_global_edges(self): + global_edges = self.get_global_edges() + # global_edges.add("global_mention") + # global_edges |= set(edge + "_rev" for edge in global_edges) + def is_ast(type): return type not in global_edges + edges = self.edges.query( + "type.map(@is_ast)", local_dict={"is_ast": is_ast}) + self.edges = edges + # self.nodes, self.edges = ensure_connectedness(self.nodes, edges) + + def _remove_reverse_edges(self): + from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import special_mapping + # TODO test this change + global_reverse = {key for key, val in special_mapping.items()} + + def not_reverse(type): return not ( + type.endswith("_rev") or type in global_reverse) + edges = self.edges.query( + "type.map(@not_reverse)", local_dict={"not_reverse": not_reverse}) + self.edges = edges + + def _add_custom_reverse(self): + to_reverse = self.edges[ + self.edges["type"].apply( + lambda type_: type_ in self.custom_reverse) + ] + + to_reverse["type"] = to_reverse["type"].apply( + lambda type_: type_ + "_rev") + tmp = to_reverse["src"] + to_reverse["src"] = to_reverse["dst"] + to_reverse["dst"] = tmp + + self.edges = self.edges.append(to_reverse[["src", "dst", "type"]]) + + def _update_global_id(self): + orig_id = [] + graph_id = [] + prev_offset = 0 + + typed_node_id_maps = self.typed_id_map + + for type 
in self.g.ntypes: + from_id, to_id = zip(*typed_node_id_maps[type].items()) + orig_id.extend(from_id) + graph_id.extend([t + prev_offset for t in to_id]) + prev_offset += self.g.number_of_nodes(type) + + global_map = dict(zip(orig_id, graph_id)) + + self.nodes['global_graph_id'] = self.nodes['id'].apply( + lambda old_id: global_map[old_id]) + import torch + for ntype in self.g.ntypes: + self.g.nodes[ntype].data['global_graph_id'] = torch.LongTensor( + list( + map(lambda x: global_map[x], self.g.nodes[ntype].data['original_id'].tolist())) + ) + + self.node_id_to_global_id = dict( + zip(self.nodes["id"], self.nodes["global_graph_id"])) + + @property + def typed_node_counts(self): + + typed_node_counts = dict() + + unique_types = self.nodes['type'].unique() + + # node_types = dict(zip(self.node_types['int_type'], self.node_types['str_type'])) + + for type_id, type in enumerate(unique_types): + nodes_of_type = len(self.nodes.query(f"type == '{type}'")) + # typed_node_counts[node_types[type]] = nodes_of_type + typed_node_counts[type] = nodes_of_type + + return typed_node_counts + + def _create_hetero_graph(self): + + nodes = self.nodes.copy() + edges = self.edges.copy() + edges = self._add_node_types_to_edges(nodes, edges) + + typed_node_id = dict(zip(nodes['id'], nodes['typed_id'])) + + possible_edge_signatures = edges[['src_type', 'type', 'dst_type']].drop_duplicates( + ['src_type', 'type', 'dst_type'] + ) + + # node_types = dict(zip(self.node_types['int_type'], self.node_types['str_type'])) + # edge_types = dict(zip(self.edge_types['int_type'], self.edge_types['str_type'])) + + # typed_subgraphs is a dictionary with subset_signature as a key, + # the dictionary stores directed edge lists + typed_subgraphs = {} + + # node_mapper = lambda old_id: typed_node_id[old_id] + # for src_type, type, dst_type, src, dst in edges[['src_type', 'type', 'dst_type', "src", "dst"]].values: + # subgraph_signature = (src_type, type, dst_type) + # if subgraph_signature in 
typed_subgraphs: + # typed_subgraphs[subgraph_signature].add((node_mapper(src), node_mapper(dst))) + # else: + # typed_subgraphs[subgraph_signature] = {node_mapper(src), node_mapper(dst)} + + for ind, row in possible_edge_signatures.iterrows(): # + # subgraph_signature = (node_types[row['src_type']], edge_types[row['type']], node_types[row['dst_type']]) + subgraph_signature = ( + row['src_type'], row['type'], row['dst_type']) + + subset = edges.query( + f"src_type == '{row['src_type']}' and type == '{row['type']}' and dst_type == '{row['dst_type']}'" + ) + + typed_subgraphs[subgraph_signature] = list( + zip( + subset['src'].map(lambda old_id: typed_node_id[old_id]), + subset['dst'].map(lambda old_id: typed_node_id[old_id]) + ) + ) + + logging.info( + f"Unique triplet types in the graph: {len(typed_subgraphs.keys())}" + ) + + import dgl + import torch + self.g = dgl.heterograph(typed_subgraphs, self.typed_node_counts) + + # node_types = dict(zip(self.node_types['str_type'], self.node_types['int_type'])) + + for ntype in self.g.ntypes: + # int_type = node_types[ntype] + + node_data = self.nodes.query( + f"type == '{ntype}'" + )[[ + 'typed_id', 'train_mask', 'test_mask', 'val_mask', 'id' # 'compact_label', + ]].sort_values('typed_id') + + self.g.nodes[ntype].data['train_mask'] = torch.tensor( + node_data['train_mask'].values, dtype=torch.bool) + self.g.nodes[ntype].data['test_mask'] = torch.tensor( + node_data['test_mask'].values, dtype=torch.bool) + self.g.nodes[ntype].data['val_mask'] = torch.tensor( + node_data['val_mask'].values, dtype=torch.bool) + # self.g.nodes[ntype].data['labels'] = torch.tensor(node_data['compact_label'].values, dtype=torch.int64) + self.g.nodes[ntype].data['typed_id'] = torch.tensor( + node_data['typed_id'].values, dtype=torch.int64) + self.g.nodes[ntype].data['original_id'] = torch.tensor( + node_data['id'].values, dtype=torch.int64) + + @staticmethod + def _assess_need_for_self_loops(nodes, edges): + # this is a hack when where are only 
outgoing connections from this node type + need_self_loop = set(edges['src'].values.tolist()) - \ + set(edges['dst'].values.tolist()) + for nid in need_self_loop: + edges = edges.append({ + "id": -1, + "type": 99, + "src": nid, + "dst": nid + }, ignore_index=True) + + return nodes, edges + + @staticmethod + def holdout(nodes: pd.DataFrame, edges: pd.DataFrame, holdout_size=10000, random_seed=42): + """ + Create a set of holdout edges, ensure that there are no orphan nodes after these edges are removed. + :param nodes: + :param edges: + :param holdout_frac: + :param random_seed: + :return: + """ + + from collections import Counter + + degree_count = Counter(edges["src"].tolist()) | Counter( + edges["dst"].tolist()) + + heldout = [] + + edges = edges.reset_index(drop=True) + index = edges.index.to_numpy() + numpy.random.seed(random_seed) + numpy.random.shuffle(index) + + for i in index: + src_id = edges.loc[i].src + if degree_count[src_id] > 2: + heldout.append(edges.loc[i].id) + degree_count[src_id] -= 1 + if len(heldout) >= holdout_size: + break + + heldout = set(heldout) + + def is_held(id_): + return id_ in heldout + + train_edges = edges[ + edges["id"].apply(lambda id_: not is_held(id_)) + ] + + heldout_edges = edges[ + edges["id"].apply(is_held) + ] + + assert len(edges) == edges["id"].unique().size + + return nodes, train_edges, heldout_edges + + @staticmethod + def get_name_group(name): + parts = name.split("@") + if len(parts) == 1: + return pd.NA + elif len(parts) == 2: + local_name, group = parts + return group + return pd.NA + + @staticmethod + def create_train_val_test_masks(nodes, train_idx, val_idx, test_idx): + nodes['train_mask'] = True + # nodes.loc[train_idx, 'train_mask'] = True + nodes['val_mask'] = False + nodes.loc[val_idx, 'val_mask'] = True + nodes['test_mask'] = False + nodes.loc[test_idx, 'test_mask'] = True + nodes['train_mask'] = nodes['train_mask'] ^ ( + nodes['val_mask'] | nodes['test_mask']) + + def starts_with(x): return 
x.startswith("##node_type") + nodes.loc[ + nodes.eval("name.map(@starts_with)", local_dict={"starts_with": starts_with}), ['train_mask', 'val_mask', + 'test_mask']] = False + + @staticmethod + def get_train_val_test_indices(indices, train_frac=0.6, random_seed=None): + if random_seed is not None: + numpy.random.seed(random_seed) + logging.warning("Random state for splitting dataset is fixed") + else: + logging.info("Random state is not set") + + indices = indices.to_numpy() + + numpy.random.shuffle(indices) + + train = int(indices.size * train_frac) + test = int(indices.size * (train_frac + (1 - train_frac) / 2)) + + logging.info( + f"Splitting into train {train}, validation {test - train}, and test {indices.size - test} sets" + ) + + return indices[:train], indices[train: test], indices[test:] + + @staticmethod + def get_train_val_test_indices_on_packages(nodes, package_names, train_frac=0.6, random_seed=None): + if random_seed is not None: + numpy.random.seed(random_seed) + logging.warning("Random state for splitting dataset is fixed") + else: + logging.info("Random state is not set") + + nodes = nodes.copy() + + package_names = [name.replace("\n", "").replace( + "-", "_").replace(".", "_") for name in package_names] + + package_names = numpy.array(package_names) + numpy.random.shuffle(package_names) + + train = int(package_names.size * train_frac) + test = int(package_names.size * (train_frac + (1 - train_frac) / 2)) + + logging.info( + f"Splitting into train {train}, validation {test - train}, and test {package_names.size - test} packages" + ) + + train, valid, test = package_names[:train], package_names[train: test], package_names[test:] + + train = set(train.tolist()) + valid = set(valid.tolist()) + test = set(test.tolist()) + + nodes["node_package_names"] = nodes["name"].map( + lambda name: name.split(".")[0]) + + def get_split_indices(split): + global_types = {val for _, val in node_types.items()} + + def is_global_type(type): + return type in global_types 
+ + def in_split(name): + return name in split + + global_nodes = nodes.query( + "node_package_names.map(@in_split) and type_backup.map(@is_global_type)", + local_dict={"in_split": in_split, + "is_global_type": is_global_type} + )["id"] + global_nodes = set(global_nodes.tolist()) + + def in_global(node_id): + return node_id in global_nodes + + ast_nodes = nodes.query( + "mentioned_in.map(@in_global)", local_dict={"in_global": in_global})["id"] + + split_nodes = global_nodes | set(ast_nodes.tolist()) + + def nodes_in_split(node_id): + return node_id in split_nodes + + return nodes.query("id.map(@nodes_in_split)", local_dict={"nodes_in_split": nodes_in_split}).index + + return get_split_indices(train), get_split_indices(valid), get_split_indices(test) + + @staticmethod + def get_global_edges(): + """ + :return: Set of global edges and their reverses + """ + from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import special_mapping, node_types + types = set() + + for key, value in special_mapping.items(): + types.add(key) + types.add(value) + + for _, value in node_types.items(): + types.add(value + "_name") + + return types + + @staticmethod + def get_embeddable_name(name): + if "@" in name: + return name.split("@")[0] + elif "_0x" in name: + return name.split("_0x")[0] + else: + return name + + @staticmethod + def ensure_connectedness(nodes: pandas.DataFrame, edges: pandas.DataFrame): + """ + Filtering isolated nodes + :param nodes: DataFrame + :param edges: DataFrame + :return: + """ + + logging.info( + f"Filtering isolated nodes. 
" + f"Starting from {nodes.shape[0]} nodes and {edges.shape[0]} edges...", + ) + unique_nodes = set(edges['src'].append(edges['dst'])) + + nodes = nodes[ + nodes['id'].apply(lambda nid: nid in unique_nodes) + ] + + logging.info( + f"Ending up with {nodes.shape[0]} nodes and {edges.shape[0]} edges" + ) + + return nodes, edges + + @staticmethod + def ensure_valid_edges(nodes, edges, ignore_src=False): + """ + Filter edges that link to nodes that do not exist + :param nodes: + :param edges: + :param ignore_src: + :return: + """ + print( + f"Filtering edges to invalid nodes. " + f"Starting from {nodes.shape[0]} nodes and {edges.shape[0]} edges...", + end="" + ) + + unique_nodes = set(nodes['id'].values.tolist()) + + if not ignore_src: + edges = edges[ + edges['src'].apply(lambda nid: nid in unique_nodes) + ] + + edges = edges[ + edges['dst'].apply(lambda nid: nid in unique_nodes) + ] + + print( + f"ending up with {nodes.shape[0]} nodes and {edges.shape[0]} edges" + ) + + return nodes, edges + + # def mark_leaf_nodes(self): + # leaf_types = {'subword', "Op", "Constant", "Name"} # the last is used in graphs without subwords + # + # self.nodes['is_leaf'] = self.nodes['type_backup'].apply(lambda type_: type_ in leaf_types) + + # def get_typed_node_id(self, node_id, node_type): + # return self.typed_id_map[node_type][node_id] + # + # def get_global_node_id(self, node_id, node_type=None): + # return self.node_id_to_global_id[node_id] + + def load_node_names(self): + """ + :return: DataFrame that contains mappings from nodes to names that appear more than once in the graph + """ + for_training = self.nodes[ + self.nodes['train_mask'] | self.nodes['test_mask'] | self.nodes['val_mask'] + ][['id', 'type_backup', 'name']]\ + .rename({"name": "serialized_name", "type_backup": "type"}, axis=1) + + global_node_types = set(node_types.values()) + for_training = for_training[ + for_training["type"].apply(lambda x: x not in global_node_types) + ] + + node_names = 
def load_subgraph_function_names(self):
    """
    Map AST `FunctionDef` nodes to their "proper" (global) short names.

    Reads `common_name_mappings.json.bz2` and returns a frame with columns
    `src` (function node id) and `dst` (last dotted component of the
    proper name).
    """
    names_path = os.path.join(
        self.data_path, "common_name_mappings.json.bz2")
    names = unpersist(names_path)

    fname2gname = dict(zip(names["ast_name"], names["proper_names"]))

    # .copy() so the column assignments below write to an owned frame
    # instead of a view of self.nodes (avoids SettingWithCopyWarning and
    # potential silent no-ops on newer pandas)
    functions = self.nodes.query(
        "id in @functions", local_dict={"functions": set(self.nodes["mentioned_in"])}
    ).query("type_backup == 'FunctionDef'").copy()

    functions["gname"] = functions["name"].apply(
        lambda x: fname2gname.get(x, pd.NA))
    # NOTE(review): dropna(axis=0) drops rows with NA in *any* column —
    # presumably only `gname` can be NA at this point; confirm upstream schema.
    functions = functions.dropna(axis=0)
    functions["gname"] = functions["gname"].apply(
        lambda x: x.split(".")[-1])

    return functions.rename({"id": "src", "gname": "dst"}, axis=1)[["src", "dst"]]
Applies + only to nodes that have subwords and have appeared in a scope (names have `@` in their names) + """ + if self.use_edge_types: + edges = self.edges.query("type == 'subword_'") + else: + edges = self.edges.query("type_backup == 'subword_'") + + target_nodes = set(edges["dst"].to_list()) + target_nodes = self.nodes.query( + "id in @target_nodes", local_dict={"target_nodes": target_nodes})[["id", "name"]] + + def name_extr(x): return x.split('@')[0] + # target_nodes.eval("group = name.map(@get_group)", local_dict={"get_group": get_name_group}, inplace=True) + # target_nodes.dropna(axis=0, inplace=True) + target_nodes.eval("name = name.map(@name_extr)", + local_dict={"name_extr": name_extr}, inplace=True) + target_nodes.rename({"id": "src", "name": "dst"}, axis=1, inplace=True) + target_nodes = filter_dst_by_freq( + target_nodes, freq=self.min_count_for_objectives) + # target_nodes.eval("cooccurr = dst.map(@occ)", local_dict={"occ": lambda name: name_cooccurr_freq.get(name, Counter())}, inplace=True) + + return target_nodes + + def load_global_edges_prediction(self): + + nodes_path = join(self.data_path, "common_nodes.json.bz2") + edges_path = join(self.data_path, "common_edges.json.bz2") + + _, edges = load_data(nodes_path, edges_path) + + global_edges = self.get_global_edges() + # these edges are already in AST? 
def load_edge_prediction(self):
    """
    AST edges suitable for the edge prediction objective.

    Loads the shared edge table, removes global and reversed edge types,
    and keeps only edges whose endpoints occur both as a source and as a
    destination somewhere among the remaining edges.

    :return: DataFrame with columns `src`, `dst`, `type`
    """

    nodes_path = join(self.data_path, "common_nodes.json.bz2")
    edges_path = join(self.data_path, "common_edges.json.bz2")

    _, edges = load_data(nodes_path, edges_path)

    edges.rename(
        {
            "source_node_id": "src",
            "target_node_id": "dst"
        }, inplace=True, axis=1
    )

    global_edges = {"global_mention", "subword", "next", "prev"}
    global_edges = global_edges | {
        "mention_scope", "defined_in_module", "defined_in_class", "defined_in_function"}

    if self.no_global_edges:
        global_edges = global_edges | self.get_global_edges()

    global_edges = global_edges | set(
        edge + "_rev" for edge in global_edges)

    # vectorized filters instead of per-row Python lambdas
    edges = edges[~edges["type"].isin(global_edges)]
    edges = edges[~edges["type"].str.endswith("_rev")]

    valid_nodes = set(edges["src"].tolist())
    valid_nodes = valid_nodes.intersection(set(edges["dst"].tolist()))

    # keep only edges whose both endpoints remain valid
    edges = edges[edges["src"].isin(valid_nodes)]
    edges = edges[edges["dst"].isin(valid_nodes)]

    return edges[["src", "dst", "type"]]
def buckets_from_pretrained_embeddings(self, pretrained_path, n_buckets):
    """
    Initialize a bucket embedding matrix from pretrained word vectors.

    Each pretrained token is hashed into one of `n_buckets` rows; Python
    operator tokens additionally get the sum of the embeddings of their
    BPE pieces.

    :param pretrained_path: path to a word2vec-style embedding file
    :param n_buckets: number of hash buckets (rows of the returned matrix)
    :return: float32 array of shape (n_buckets, n_dims)
    """

    from SourceCodeTools.nlp.embed.fasttext import load_w2v_map
    from SourceCodeTools.nlp import token_hasher
    pretrained = load_w2v_map(pretrained_path)

    import numpy as np

    embs_init = np.random.randn(
        n_buckets, pretrained.n_dims).astype(np.float32)

    for word in pretrained.keys():
        ind = token_hasher(word, n_buckets)
        embs_init[ind, :] = pretrained[word]

    def op_embedding(op_tokens):
        # Sum the embeddings of the tokens present in the pretrained
        # vocabulary. The original added `None` whenever a *later* token
        # was missing, raising TypeError; skip missing tokens instead.
        embedding = None
        for token in op_tokens:
            token_emb = pretrained.get(token, None)
            if token_emb is None:
                continue
            embedding = token_emb if embedding is None else embedding + token_emb
        return embedding

    python_ops_to_bpe = self._op_tokens()
    for op, op_tokens in python_ops_to_bpe.items():
        op_emb = op_embedding(op_tokens)
        if op_emb is not None:
            op_ind = token_hasher(op, n_buckets)
            embs_init[op_ind, :] = op_emb

    return embs_init
@classmethod
def load(cls, path, args):
    """
    Restore a pickled dataset and point it at the current data/tokenizer paths.

    :param path: path to the pickled dataset
    :param args: dict with at least `data_path` (and `tokenizer` when the
        saved dataset used a tokenizer)
    :return: the unpickled dataset instance
    """
    # use a context manager so the file handle is not leaked
    with open(path, "rb") as source:
        dataset = pickle.load(source)
    dataset.data_path = args["data_path"]
    if dataset.tokenizer_path is not None:
        dataset.tokenizer_path = args["tokenizer"]
    return dataset
class NodeClfMasker(SubwordMasker):
    """
    Pass-through masker for node classification: no subword lookup table is
    built, and `get_mask` simply wraps the requested ids in the dict shape
    the trainer expects.
    """
    def __init__(self, nodes: pd.DataFrame, edges: pd.DataFrame, **kwargs):
        super(NodeClfMasker, self).__init__(nodes, edges, **kwargs)

    def instantiate(self, nodes, orig_edges, **kwargs):
        # nothing to precompute for node classification
        pass

    def get_mask(self, ids):
        """
        Wrap node ids for masking without any lookup.

        :param ids: either a dict (returned as-is) or a collection of node
            ids, which is wrapped under the "node_" key
        :return: dict mapping node-type key to ids
        """
        if isinstance(ids, dict):
            for_masking = ids
        else:
            for_masking = {"node_": ids}
        return for_masking
def write_json(df, path, **kwargs):
    """
    Write a DataFrame as JSON-lines (one record per line).

    Pass ``mode="a"`` to append to an existing file (a newline separator is
    written first); any remaining kwargs are forwarded to `DataFrame.to_json`.

    :param df: DataFrame to serialize
    :param path: destination file path
    """
    # idiomatic pop-with-default instead of an if/else around kwargs
    mode = kwargs.pop("mode", None)

    if mode == "a":
        with open(path, "a") as sink:
            # separate appended records from existing content
            sink.write("\n")
            sink.write(df.to_json(path_or_buf=None, orient="records", lines=True, **kwargs))
    else:
        df.to_json(path_or_buf=path, orient="records", lines=True, **kwargs)
last_index): + chunk = pd.read_json("".join(buffer), orient="records", lines=True, **kwargs) + chunk.index = np.array(list(range(last_index, last_index + len(chunk)))) + last_index = last_index + len(chunk) + return chunk, last_index + + for ind, line in enumerate(source): + buffer.append(line) + if len(buffer) >= chunksize: + chunk, last_index = prepare_chunk(buffer, last_index) + # chunk = pd.read_json("".join(buffer), orient="records", lines=True, **kwargs) + # chunk.index = np.array(list(range(last_index, last_index + len(chunk)))) + # last_index = last_index + len(chunk) + yield chunk + buffer.clear() + if len(buffer) != 0: + chunk, last_index = prepare_chunk(buffer, last_index) + # chunk = pd.read_json("".join(buffer), orient="records", lines=True, **kwargs) + # chunk.index = np.array(list(range(last_index, last_index + len(chunk)))) + # last_index = last_index + len(chunk) + yield chunk + + +def _grow_with_chunks(chunks): + table = None + for chunk in chunks: + if table is None: + table = chunk + else: + table = pd.concat([table, chunk], copy=False) + return table + + +def read_json(path, **kwargs): + if "chunksize" in kwargs: + return read_json_with_generator(path, kwargs.pop("chunksize"), **kwargs) + else: + return _grow_with_chunks(read_json_with_generator(path, 100000, **kwargs)) + # return pd.read_json(path, orient="records", lines=True, **kwargs) + + def read_source_location(base_path): source_location_path = os.path.join(base_path, filenames["source_location"]) @@ -138,26 +208,58 @@ def write_processed_bodies(df, base_path): persist(df, bodies_path) -def persist(df: pd.DataFrame, path: str, **kwargs): - if path.endswith(".csv"): +def likely_format(path): + path = Path(path) + name_parts = path.name.split(".") + if len(name_parts) == 1: + raise ValueError("Extension is not found for the file:", str(path)) + + extensions = "." 
+ ".".join(name_parts[1:]) + + if ".csv" in extensions or ".tsv" in extensions: + ext = "csv" + elif ".json" in extensions: + ext = "json" + elif ".pkl" in extensions or extensions.endswith(".bz2"): + ext = "pkl" + elif ".parquet" in extensions: + ext = "parquet" + else: + raise NotImplementedError("supported extensions: csv, bz2, pkl, parquet, json", extensions) + + return ext + + +def persist(df: pd.DataFrame, path: Union[str, Path, bytes], **kwargs): + if isinstance(path, Path): + path = str(path.absolute()) + + format = likely_format(path) + if format == "csv": write_csv(df, path, **kwargs) - elif path.endswith(".pkl") or path.endswith(".bz2"): + elif format == "pkl": write_pickle(df, path, **kwargs) - elif path.endswith(".parquet"): + elif format == "parquet": write_parquet(df, path, **kwargs) - else: - raise NotImplementedError("supported extensions: csv, bz2, pkl, parquet") + elif format == "json": + write_json(df, path, **kwargs) + +def unpersist(path: Union[str, Path, bytes], **kwargs) -> pd.DataFrame: + if isinstance(path, Path): + path = str(path.absolute()) -def unpersist(path: str, **kwargs) -> pd.DataFrame: - if path.endswith(".csv"): + format = likely_format(path) + if format == "csv": data = read_csv(path, **kwargs) - elif path.endswith(".pkl") or path.endswith(".bz2"): + elif format == "pkl": data = read_pickle(path, **kwargs) - elif path.endswith(".parquet"): + elif format == "parquet": data = read_parquet(path, **kwargs) + elif format == "json": + data = read_json(path, **kwargs) else: - raise NotImplementedError("supported extensions: csv, bz2, pkl, parquet") + data = None return data @@ -178,3 +280,19 @@ def unpersist_or_exit(path, exit_message=None, **kwargs): sys.exit() else: return data + + +def get_random_name(length=10): + char_ranges = [chr(i) for i in range(ord("a"), ord("a")+26)] + \ + [chr(i) for i in range(ord("A"), ord("A")+26)] + \ + [chr(i) for i in range(ord("0"), ord("0")+10)] + from random import sample + return 
"".join(sample(char_ranges, k=length)) + +def get_temporary_filename(): + tmp_dir = tempfile.gettempdir() + name_generator = tempfile._get_candidate_names() + path = os.path.join(tmp_dir, next(name_generator)) + while os.path.isdir(path): + path = os.path.join(tmp_dir, next(name_generator)) + return path \ No newline at end of file diff --git a/SourceCodeTools/code/data/py_150/create_download_links.py b/SourceCodeTools/code/data/py_150/create_download_links.py new file mode 100644 index 00000000..63344569 --- /dev/null +++ b/SourceCodeTools/code/data/py_150/create_download_links.py @@ -0,0 +1,22 @@ +import sys + +def parse_line(line): + hash, repo_link = line.split("\t") + parts = repo_link.split("/") + user = parts[3] + repo = parts[4] + return user, repo, hash + +def main(): + # download_link = "https://codeload.github.com/{user}/{repo}/zip/{hash}" + download_link = "https://codeload.github.com/{}/{}/zip/{}" + + for line in sys.stdin: + line = line.strip() + if line == "": + continue + + print(download_link.format(*parse_line(line))) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/py_150/download.sh b/SourceCodeTools/code/data/py_150/download.sh new file mode 100644 index 00000000..bbd77dc8 --- /dev/null +++ b/SourceCodeTools/code/data/py_150/download.sh @@ -0,0 +1,7 @@ +while read p; do + reponame=$(echo $p | awk -F"/" '{print $5}') + echo $reponame, $p + wget -O "$reponame.zip" $p + echo "Waiting 2s..." 
def subgraph_partitioning(path_to_dataset, partition_column, train_frac=0.7):
    """
    Assign every file-level subgraph to a train/val/test split and persist the
    result as `subgraph_partition.json` next to the dataset.

    All rows sharing the same value in `partition_column` receive the same
    split, so a task/user never straddles two splits.

    :param path_to_dataset: directory containing common_filecontent
    :param partition_column: column of filecontent used to group rows
    :param train_frac: kept for interface compatibility; the split below is
        hard-coded to 0.8/0.1/0.1
    """

    get_path = partial(join, path_to_dataset)

    # nodes = read_nodes(get_path("common_nodes.json.bz2"))
    # edges were loaded but never used — skip the (expensive) read
    # edges = read_edges(get_path("common_edges.json.bz2"))
    filecontent = unpersist(get_path("common_filecontent.json.bz2"))

    def task_partition(task):
        # 0.8 / 0.1 / 0.1 split
        num = random()
        if num < 0.8:
            return "train"
        elif num < 0.9:  # BUG FIX: previously compared `task` (the group value)
            return "val"
        else:
            return "test"

    task2split = dict(zip(filecontent[partition_column], [
        task_partition(task) for task in filecontent[partition_column]]))
    file_id2split = np.array([task2split[task]
                              for task in filecontent[partition_column]])

    # .copy() so the mask assignments below write to an owned frame,
    # not a view of filecontent
    subgraph_ids = filecontent[["id"]].copy()

    subgraph_ids["train_mask"] = file_id2split == "train"
    subgraph_ids["val_mask"] = file_id2split == "val"
    subgraph_ids["test_mask"] = file_id2split == "test"

    persist(subgraph_ids, get_path("subgraph_partition.json"))
parse_filepath(self, parts): + self.filepath = " ".join(self.info.split(".py")[0].split(" ")[1:]) + ".py" + # self.filepath = parts[1].split(".py")[0] + ".py" + + def parse_fn_path(self, parts): + self.fn_path = self.filepath + "/" + self.info.split(".py")[1].lstrip("/").lstrip(" ").split("/")[0] + + def parse_state(self, parts): + self.state = "/".join(self.info.split(".py")[1].lstrip("/").lstrip(" ").split("/")[1:]) + + def parse_package(self, parts): + self.package = self.filepath.split("/")[0] + + def set_id(self): + identifier = "\t".join([self.fn_path, self.state]) + self.id = hashlib.md5(identifier.encode('utf-8')).hexdigest() + + oidentifier = "\t".join([self.fn_path, "original"]) + + if self.state.endswith("riginal"): + assert identifier == oidentifier + self.original_id = hashlib.md5(oidentifier.encode('utf-8')).hexdigest() + + def __repr__(self): + return self.info + + +class DatasetAdapter: + replacements = { + "async": "async_", + "await": "await_" + } + + fields = { + "python_tasks": ["flines", "task" , "user", "year"], + } + # supported_partitions = ["train", "dev", "eval"] # + + benchmark_names = { + "python_tasks": "scaa_python.csv", + } + + preferred_column_order = ["id", "package", "function", "info", "label", "partition"] + + import_order = [ + "python_tasks", + ] + + def __init__(self, dataset_location): + self.dataset_location = Path(dataset_location) + + self.replacement_fns = { + "flines": self.fix_code_if_needed, + } + + self.preprocess = { + # "variable_misuse_repair": { + # "function": self.cubert_detokenize + # } + } + + self.extra_fields = { + # "info": [("package", self.get_package)], + # "provenance": [("info", self.fix_info_if_needed)] + } + + # self.db = SQLTable(self.dataset_location.joinpath("cubert_benchmarks.db")) + + # def load_original_functions(self): + # functions = self.db.query("SELECT DISTINCT original_id, function FROM functions where comment = 'original' AND dataset = 'variable_misuse'") + # self.original_functions = 
dict(zip(functions["original_id"], functions["function"])) + + def prepare_misuse_repair_record(self, record): + print(record) + + @staticmethod + def get_source_from_ast_range(node, function, strip=True): + lines = function.split("\n") + start_line = node.lineno + end_line = node.end_lineno + start_col = node.col_offset + end_col = node.end_col_offset + + source = "" + num_lines = end_line - start_line + 1 + if start_line == end_line: + section = lines[start_line - 1].encode("utf8")[start_col:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" + else: + for ind, lineno in enumerate(range(start_line - 1, end_line)): + if ind == 0: + section = lines[lineno].encode("utf8")[start_col:].decode( + "utf8") + source += section.strip() if strip else section + "\n" + elif ind == num_lines - 1: + section = lines[lineno].encode("utf8")[:end_col].decode( + "utf8") + source += section.strip() if strip else section + "\n" + else: + section = lines[lineno] + source += section.strip() if strip else section + "\n" + + return source.rstrip() + + def get_dispatch(self, function): + root = ast.parse(function) + return self.get_source_from_ast_range(root.body[0].decorator_list[0], function) + + @staticmethod + def remove_indent(code): + lines = code.strip("\n").split("\n") + first_line_indent = lines[0][:len(lines[0]) - len(lines[0].lstrip())] + start_char = len(first_line_indent) + if start_char != 0: + clean = "\n".join(line[start_char:] if line.startswith(first_line_indent) else line for line in lines) + else: + if lines[0].lstrip().startswith("@"): + for ind, line in enumerate(lines): + stripped = line.lstrip() + if stripped.startswith("def "): + lines[ind] = stripped + break + clean = "\n".join(lines) + return clean + + @classmethod + def fix_code_if_needed(cls, code): + # f = code.lstrip() + f = cls.remove_indent(code) + try: + ast.parse(f) + except Exception as e: + tokens = CodeTokenizer.tokenize(f) + recovered_tokens = [cls.replacements[token] 
if token in cls.replacements else token for token in tokens] + f = CodeTokenizer.detokenize(recovered_tokens) + ast.parse(f) + return f + + @classmethod + def fix_info_if_needed(cls, info): + if not info.endswith("original"): + parts = info.split(" ") + variable_replacement = parts[-1] + + variable_replacement = CodeTokenizer.detokenize( + cls.replacements[token] if token in cls.replacements else token + for token in CodeTokenizer.tokenize(variable_replacement) + ) + + parts[-1] = variable_replacement + + info = " ".join(parts) + return info + + @staticmethod + def get_package(info): + return info.split(" ")[1].split("/")[0] + + @classmethod + def sort_columns(cls, columns): + return sorted(columns, key=cls.preferred_column_order.index) + + def process_record(self, record, preprocess_fns): + new_record = {} + for field, data in record.items(): + if preprocess_fns is not None and field in preprocess_fns: + record[field] = preprocess_fns[field](data) + + new_record[field] = data if field not in self.replacement_fns else self.replacement_fns[field](" " + data) + + for new_field, new_field_fn in self.extra_fields.get(field, []): + new_record[new_field] = new_field_fn(data) + + return new_record + + @classmethod + def stream_original_partition(cls, file, *args, **kwargs): + data = pd.read_csv(file) + for record in data.to_dict(orient="records"): + yield record + + def stream_processed_partition(self, file, preprocess_fns=None): + for record in self.stream_original_partition(file): + try: + r = self.process_record(record, preprocess_fns=preprocess_fns) + r["parsing_error"] = None + except Exception as e: + r = record + r["parsing_error"] = e.msg if hasattr(e, "msg") else e.__class__.__name__ + # except MemoryError: # there are two functions that cause this error + # continue + # except SyntaxError as e: + # continue + yield r + + # @classmethod + # def process_dataset(cls, original_data_location, output_location): + # partitions = ["train", "dev", "eval"] + # + # last_id 
= 0 + # column_order = None + # + # data = pd.DataFrame.from_records( + # chain( + # *(cls.stream_processed_partition(original_data_location, partition, add_partition=True) for partition in + # partitions)), + # # columns = column_order + # ) + # data["id"] = range(len(data)) + # # data.to_pickle(output_location, index=False, columns=cls.sort_columns(data.columns)) + # data.to_pickle(output_location) + + def iterate_dataset(self, dataset_name): + for record in self.stream_processed_partition( + self.dataset_location.joinpath(self.benchmark_names[dataset_name]), + preprocess_fns=self.preprocess.get(dataset_name, None) + ): + yield record + + def import_data(self): + + functions = [] + + # added_original = set() + + for dataset_name in self.import_order: + + parsed_successfully = 0 + parsed_with_errors = 0 + + dataset_file = open(self.dataset_location.joinpath(f"{dataset_name}.jsonl"), "w") + + for record in tqdm(self.iterate_dataset(dataset_name), desc=f"Processing {dataset_name}"): + record_for_writing = { + "id": record["id"], + "function": record["flines"], + "user": record["user"], + "task": record["task"], + "year": record["year"], + "package": record["user"], + "parsing_error": record["parsing_error"] + } + + if record["parsing_error"] is None: + parsed_successfully += 1 + dataset_file.write(f"{json.dumps(record_for_writing)}\n") + else: + parsed_with_errors += 1 + + dataset_file.close() + print(f"{dataset_name}: success {parsed_successfully} error {parsed_with_errors}") + # functions.append(record_for_writing) + # + # if len(functions) > 100000: + # self.db.add_records(pd.DataFrame.from_records(functions), "functions") + # functions.clear() + # + # if len(functions) > 0: + # self.db.add_records(pd.DataFrame.from_records(functions), "functions") + # functions.clear() + + + +def test_fix_info_if_needed(): + example_info = "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 
`async`->`self`" + assert DatasetAdapter.fix_info_if_needed( + example_info) == "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async_`->`self`" + + example_info = "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async_call`->`self`" + assert DatasetAdapter.fix_info_if_needed( + example_info) == "dataset/ETHPy150Open EricssonResearch/calvin-base/calvin/requests/request_handler.py RequestHandler.get_index/VarMisuse@32/36 `async_call`->`self`" + + +test_fix_info_if_needed() + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser("Convert CuBERT's variable misuse detection dataset for further processing") + parser.add_argument("dataset_path", help="Path to dataset folder") + # parser.add_argument("output_path", help="Path to output file") + args = parser.parse_args() + + dataset = DatasetAdapter(args.dataset_path) + dataset.import_data() + # DatasetAdapter.process_dataset(args.dataset_path, args.output_path) \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/Dataset.py b/SourceCodeTools/code/data/sourcetrail/Dataset.py deleted file mode 100644 index 56f3de34..00000000 --- a/SourceCodeTools/code/data/sourcetrail/Dataset.py +++ /dev/null @@ -1,894 +0,0 @@ -# from collections import Counter -# from itertools import chain -from collections import Counter - -import pandas -import numpy -import pickle - -from os.path import join - -from SourceCodeTools.code.data.sourcetrail.SubwordMasker import SubwordMasker, NodeNameMasker -from SourceCodeTools.code.data.sourcetrail.file_utils import * -from SourceCodeTools.code.python_ast import PythonSharedNodes -from SourceCodeTools.nlp.embed.bpe import make_tokenizer, load_bpe_model -from SourceCodeTools.tabular.common import compact_property -from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types 
-from SourceCodeTools.code.data.sourcetrail.sourcetrail_extract_node_names import extract_node_names - - -def load_data(node_path, edge_path): - nodes = unpersist(node_path) - edges = unpersist(edge_path) - - nodes_ = nodes.rename(mapper={ - 'serialized_name': 'name' - }, axis=1).astype({ - 'type': 'category' - }) - - edges_ = edges.rename(mapper={ - 'source_node_id': 'src', - 'target_node_id': 'dst' - }, axis=1).astype({ - 'type': 'category' - }) - - return nodes_, edges_ - - -def get_name_group(name): - parts = name.split("@") - if len(parts) == 1: - return pd.NA - elif len(parts) == 2: - local_name, group = parts - return group - return pd.NA - - -def filter_dst_by_freq(elements, freq=1): - counter = Counter(elements["dst"]) - allowed = {item for item, count in counter.items() if count >= freq} - target = elements.query("dst in @allowed", local_dict={"allowed": allowed}) - return target - - -def create_train_val_test_masks(nodes, train_idx, val_idx, test_idx): - nodes['train_mask'] = True - # nodes.loc[train_idx, 'train_mask'] = True - nodes['val_mask'] = False - nodes.loc[val_idx, 'val_mask'] = True - nodes['test_mask'] = False - nodes.loc[test_idx, 'test_mask'] = True - nodes['train_mask'] = nodes['train_mask'] ^ (nodes['val_mask'] | nodes['test_mask']) - starts_with = lambda x: x.startswith("##node_type") - nodes.loc[nodes.eval("name.map(@starts_with)", local_dict={"starts_with": starts_with}), ['train_mask', 'val_mask', 'test_mask']] = False - - -def get_train_val_test_indices(indices, train_frac=0.6, random_seed=None): - if random_seed is not None: - numpy.random.seed(random_seed) - logging.warning("Random state for splitting dataset is fixed") - else: - logging.info("Random state is not set") - - indices = indices.to_numpy() - - numpy.random.shuffle(indices) - - train = int(indices.size * train_frac) - test = int(indices.size * (train_frac + (1 - train_frac) / 2)) - - logging.info( - f"Splitting into train {train}, validation {test - train}, and test 
{indices.size - test} sets" - ) - - return indices[:train], indices[train: test], indices[test:] - - -def get_train_val_test_indices_on_packages(nodes, package_names, train_frac=0.6, random_seed=None): - if random_seed is not None: - numpy.random.seed(random_seed) - logging.warning("Random state for splitting dataset is fixed") - else: - logging.info("Random state is not set") - - nodes = nodes.copy() - - package_names = [name.replace("\n", "").replace("-","_").replace(".","_") for name in package_names] - - package_names = numpy.array(package_names) - numpy.random.shuffle(package_names) - - train = int(package_names.size * train_frac) - test = int(package_names.size * (train_frac + (1 - train_frac) / 2)) - - logging.info( - f"Splitting into train {train}, validation {test - train}, and test {package_names.size - test} packages" - ) - - train, valid, test = package_names[:train], package_names[train: test], package_names[test:] - - train = set(train.tolist()) - valid = set(valid.tolist()) - test = set(test.tolist()) - - nodes["node_package_names"] = nodes["name"].map(lambda name: name.split(".")[0]) - - def get_split_indices(split): - global_types = {val for _, val in node_types.items()} - - def is_global_type(type): - return type in global_types - - def in_split(name): - return name in split - - global_nodes = nodes.query( - "node_package_names.map(@in_split) and type_backup.map(@is_global_type)", - local_dict={"in_split": in_split, "is_global_type": is_global_type} - )["id"] - global_nodes = set(global_nodes.tolist()) - - def in_global(node_id): - return node_id in global_nodes - - ast_nodes = nodes.query("mentioned_in.map(@in_global)", local_dict={"in_global": in_global})["id"] - - split_nodes = global_nodes | set(ast_nodes.tolist()) - - def nodes_in_split(node_id): - return node_id in split_nodes - - return nodes.query("id.map(@nodes_in_split)", local_dict={"nodes_in_split": nodes_in_split}).index - - return get_split_indices(train), get_split_indices(valid), 
get_split_indices(test) - - -def get_global_edges(): - """ - :return: Set of global edges and their reverses - """ - from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import special_mapping, node_types - types = set() - - for key, value in special_mapping.items(): - types.add(key) - types.add(value) - - for _, value in node_types.items(): - types.add(value + "_name") - - return types - - -class SourceGraphDataset: - g = None - nodes = None - edges = None - node_types = None - edge_types = None - - train_frac = None - random_seed = None - labels_from = None - use_node_types = None - use_edge_types = None - filter = None - self_loops = None - - def __init__(self, data_path, - label_from, use_node_types=False, - use_edge_types=False, filter=None, self_loops=False, - train_frac=0.6, random_seed=None, tokenizer_path=None, min_count_for_objectives=1, - no_global_edges=False, remove_reverse=False, package_names=None): - """ - Prepares the data for training GNN model. The graph is prepared in the following way: - 1. Edges are split into the train set and holdout set. Holdout set is used in the future experiments. - Without holdout set the results of the future experiments may be biased. After removing holdout edges - from the main graph, the disconnected nodes are filtered, so that he graph remain connected. - 2. Since training objective will be defined on the node embeddings, the nodes are split into train, test, - and validation sets. The test set should be used in the future experiments for training. Validation and - test sets are equal in size and constitute 40% of all nodes. - 3. The default label is assumed to be node type. Node types can be incorporated into the model by setting - node_types flag to True. - 4. Graphs require contiguous indexing of nodes. For this reason additional mapping is created that tracks - the relationship between the new graph id and the original node id from the training data. 
- :param nodes_path: path to csv or compressed csv for nodes witch columns - "id", "type", {"name", "serialized_name"}, {any column with labels} - :param edges_path: path to csv or compressed csv for edges with columns - "id", "type", {"source_node_id", "src"}, {"target_node_id", "dst"} - :param label_from: the column where the labels are taken from - :param use_node_types: boolean value, whether to use node types or not - (node-heterogeneous graph} - :param use_edge_types: boolean value, whether to use edge types or not - (edge-heterogeneous graph} - :param filter: list[str], the types of edges to filter from graph - """ - - self.random_seed = random_seed - self.nodes_have_types = use_node_types - self.edges_have_types = use_edge_types - self.labels_from = label_from - self.data_path = data_path - self.tokenizer_path = tokenizer_path - self.min_count_for_objectives = min_count_for_objectives - self.no_global_edges = no_global_edges - self.remove_reverse = remove_reverse - - nodes_path = join(data_path, "nodes.bz2") - edges_path = join(data_path, "edges.bz2") - - self.nodes, self.edges = load_data(nodes_path, edges_path) - - # index is later used for sampling and is assumed to be unique - assert len(self.nodes) == len(self.nodes.index.unique()) - assert len(self.edges) == len(self.edges.index.unique()) - - if self_loops: - self.nodes, self.edges = SourceGraphDataset.assess_need_for_self_loops(self.nodes, self.edges) - - if filter is not None: - for e_type in filter.split(","): - logging.info(f"Filtering edge type {e_type}") - self.edges = self.edges.query(f"type != '{e_type}'") - - if self.remove_reverse: - self.remove_reverse_edges() - - if self.no_global_edges: - self.remove_global_edges() - - if use_node_types is False and use_edge_types is False: - new_nodes, new_edges = self.create_nodetype_edges() - self.nodes = self.nodes.append(new_nodes, ignore_index=True) - self.edges = self.edges.append(new_edges, ignore_index=True) - - self.nodes['type_backup'] = 
self.nodes['type'] - if not self.nodes_have_types: - self.nodes['type'] = "node_" - self.nodes = self.nodes.astype({'type': 'category'}) - - self.add_embeddable_flag() - - # need to do this to avoid issues insode dgl library - self.edges['type'] = self.edges['type'].apply(lambda x: f"{x}_") - self.edges['type_backup'] = self.edges['type'] - if not self.edges_have_types: - self.edges['type'] = "edge_" - self.edges = self.edges.astype({'type': 'category'}) - - # compact labels - # self.nodes['label'] = self.nodes[label_from] - # self.nodes = self.nodes.astype({'label': 'category'}) - # self.label_map = compact_property(self.nodes['label']) - # assert any(pandas.isna(self.nodes['label'])) is False - - logging.info(f"Unique nodes: {len(self.nodes)}, node types: {len(self.nodes['type'].unique())}") - logging.info(f"Unique edges: {len(self.edges)}, edge types: {len(self.edges['type'].unique())}") - - # self.nodes, self.label_map = self.add_compact_labels() - self.add_typed_ids() - - self.add_splits(train_frac=train_frac, package_names=package_names) - - # self.mark_leaf_nodes() - - self.create_hetero_graph() - - self.update_global_id() - - self.nodes.sort_values('global_graph_id', inplace=True) - - # self.splits = SourceGraphDataset.get_global_graph_id_splits(self.nodes) - - @classmethod - def get_global_graph_id_splits(cls, nodes): - - splits = ( - nodes.query("train_mask == True")['global_graph_id'].values, - nodes.query("val_mask == True")['global_graph_id'].values, - nodes.query("test_mask == True")['global_graph_id'].values, - ) - - return splits - - def compress_node_types(self): - node_type_map = compact_property(self.nodes['type']) - self.node_types = pd.DataFrame( - {"str_type": k, "int_type": v} for k, v in compact_property(self.nodes['type']).items() - ) - - self.nodes['type'] = self.nodes['type'].apply(lambda x: node_type_map[x]) - - def compress_edge_types(self): - edge_type_map = compact_property(self.edges['type']) - self.edge_types = pd.DataFrame( - 
{"str_type": k, "int_type": v} for k, v in compact_property(self.edges['type']).items() - ) - - self.edges['type'] = self.edges['type'].apply(lambda x: edge_type_map[x]) - - def add_embeddable_flag(self): - embeddable_types = PythonSharedNodes.shared_node_types - - if len(self.nodes.query("type_backup == 'subword'")) > 0: - # some of the types should not be embedded if subwords were generated - embeddable_types = embeddable_types - PythonSharedNodes.tokenizable_types - - embeddable_types |= {"node_type"} - - # self.nodes['embeddable'] = False - self.nodes.eval( - "embeddable = type_backup in @embeddable_types", - local_dict={"embeddable_types": embeddable_types}, - inplace=True - ) - - def op_tokens(self): - if self.tokenizer_path is None: - from SourceCodeTools.code.python_tokens_to_bpe_subwords import python_ops_to_bpe - logging.info("Using heuristic tokenization for ops") - - # def op_tokenize(op_name): - # return python_ops_to_bpe[op_name] if op_name in python_ops_to_bpe else None - return python_ops_to_bpe - else: - # from SourceCodeTools.code.python_tokens_to_bpe_subwords import op_tokenize_or_none - - tokenizer = make_tokenizer(load_bpe_model(self.tokenizer_path)) - - # def op_tokenize(op_name): - # return op_tokenize_or_none(op_name, tokenizer) - - from SourceCodeTools.code.python_tokens_to_bpe_subwords import python_ops_to_literal - return { - op_name: tokenizer(op_literal) - for op_name, op_literal in python_ops_to_literal.items() - } - - # self.nodes.eval("name_alter_tokens = name.map(@op_tokenize)", - # local_dict={"op_tokenize": op_tokenize}, inplace=True) - - def add_splits(self, train_frac, package_names=None): - """ - Generates train, validation, and test masks - Store the masks is pandas table for nodes - :param train_frac: - :return: - """ - - assert len(self.nodes.index) == self.nodes.index.max() + 1 - # generate splits for all nodes, additional filtering will be applied later - # by an objective - - if package_names is None: - splits = 
get_train_val_test_indices( - self.nodes.index, - train_frac=train_frac, random_seed=self.random_seed - ) - else: - splits = get_train_val_test_indices_on_packages( - self.nodes, package_names, - train_frac=train_frac, random_seed=self.random_seed - ) - - create_train_val_test_masks(self.nodes, *splits) - - def add_typed_ids(self): - nodes = self.nodes.copy() - - typed_id_map = {} - - # node_types = dict(zip(self.node_types['int_type'], self.node_types['str_type'])) - - for type in nodes['type'].unique(): - # need to use indexes because will need to reference - # the original table - type_ind = nodes[nodes['type'] == type].index - - id_map = compact_property(nodes.loc[type_ind, 'id']) - - nodes.loc[type_ind, 'typed_id'] = nodes.loc[type_ind, 'id'].apply(lambda old_id: id_map[old_id]) - - # typed_id_map[node_types[type]] = id_map - typed_id_map[type] = id_map - - assert any(pandas.isna(nodes['typed_id'])) is False - - nodes = nodes.astype({"typed_id": "int"}) - - self.nodes, self.typed_id_map = nodes, typed_id_map - # return nodes, typed_id_map - - # def add_compact_labels(self): - # nodes = self.nodes.copy() - # label_map = compact_property(nodes['label']) - # nodes['compact_label'] = nodes['label'].apply(lambda old_id: label_map[old_id]) - # return nodes, label_map - - def add_node_types_to_edges(self, nodes, edges): - - # nodes = self.nodes - # edges = self.edges.copy() - - node_type_map = dict(zip(nodes['id'].values, nodes['type'])) - - edges['src_type'] = edges['src'].apply(lambda src_id: node_type_map[src_id]) - edges['dst_type'] = edges['dst'].apply(lambda dst_id: node_type_map[dst_id]) - edges = edges.astype({'src_type': 'category', 'dst_type': 'category'}) - - return edges - - def create_nodetype_edges(self): - node_new_id = self.nodes["id"].max() + 1 - edge_new_id = self.edges["id"].max() + 1 - - new_nodes = [] - new_edges = [] - added_type_nodes = {} - - node_slice = self.nodes[["id", "type"]].values - - for id, type in node_slice: - if type not in 
added_type_nodes: - added_type_nodes[type] = node_new_id - node_new_id += 1 - - new_nodes.append({ - "id": added_type_nodes[type], - "name": f"##node_type_{type}", - "type": "node_type", - "mentioned_in": pd.NA - }) - - new_edges.append({ - "id": edge_new_id, - "type": "node_type", - "src": added_type_nodes[type], - "dst": id, - "file_id": pd.NA, - "mentioned_in": pd.NA - }) - edge_new_id += 1 - - return pd.DataFrame(new_nodes), pd.DataFrame(new_edges) - - def remove_ast_edges(self): - global_edges = get_global_edges() - global_edges.add("subword") - is_global = lambda type: type in global_edges - edges = self.edges.query("type_backup.map(@is_global)", local_dict={"is_global": is_global}) - self.nodes, self.edges = ensure_connectedness(self.nodes, edges) - - def remove_global_edges(self): - global_edges = get_global_edges() - global_edges.add("global_mention") - is_ast = lambda type: type not in global_edges - edges = self.edges.query("type.map(@is_ast)", local_dict={"is_ast": is_ast}) - self.edges = edges - # self.nodes, self.edges = ensure_connectedness(self.nodes, edges) - - def remove_reverse_edges(self): - from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import special_mapping - global_reverse = {val for _, val in special_mapping.items()} - - not_reverse = lambda type: not (type.endswith("_rev") or type in global_reverse) - edges = self.edges.query("type.map(@not_reverse)", local_dict={"not_reverse": not_reverse}) - self.edges = edges - - def update_global_id(self): - orig_id = [] - graph_id = [] - prev_offset = 0 - - typed_node_id_maps = self.typed_id_map - - for type in self.g.ntypes: - from_id, to_id = zip(*typed_node_id_maps[type].items()) - orig_id.extend(from_id) - graph_id.extend([t + prev_offset for t in to_id]) - prev_offset += self.g.number_of_nodes(type) - - global_map = dict(zip(orig_id, graph_id)) - - self.nodes['global_graph_id'] = self.nodes['id'].apply(lambda old_id: global_map[old_id]) - import torch - for ntype in self.g.ntypes: - 
self.g.nodes[ntype].data['global_graph_id'] = torch.LongTensor( - list(map(lambda x: global_map[x], self.g.nodes[ntype].data['original_id'].tolist())) - ) - - self.node_id_to_global_id = dict(zip(self.nodes["id"], self.nodes["global_graph_id"])) - - @property - def typed_node_counts(self): - - typed_node_counts = dict() - - unique_types = self.nodes['type'].unique() - - # node_types = dict(zip(self.node_types['int_type'], self.node_types['str_type'])) - - for type_id, type in enumerate(unique_types): - nodes_of_type = len(self.nodes.query(f"type == '{type}'")) - # typed_node_counts[node_types[type]] = nodes_of_type - typed_node_counts[type] = nodes_of_type - - return typed_node_counts - - def create_hetero_graph(self): - - nodes = self.nodes.copy() - edges = self.edges.copy() - edges = self.add_node_types_to_edges(nodes, edges) - - typed_node_id = dict(zip(nodes['id'], nodes['typed_id'])) - - possible_edge_signatures = edges[['src_type', 'type', 'dst_type']].drop_duplicates( - ['src_type', 'type', 'dst_type'] - ) - - # node_types = dict(zip(self.node_types['int_type'], self.node_types['str_type'])) - # edge_types = dict(zip(self.edge_types['int_type'], self.edge_types['str_type'])) - - # typed_subgraphs is a dictionary with subset_signature as a key, - # the dictionary stores directed edge lists - typed_subgraphs = {} - - # node_mapper = lambda old_id: typed_node_id[old_id] - # for src_type, type, dst_type, src, dst in edges[['src_type', 'type', 'dst_type', "src", "dst"]].values: - # subgraph_signature = (src_type, type, dst_type) - # if subgraph_signature in typed_subgraphs: - # typed_subgraphs[subgraph_signature].add((node_mapper(src), node_mapper(dst))) - # else: - # typed_subgraphs[subgraph_signature] = {node_mapper(src), node_mapper(dst)} - - for ind, row in possible_edge_signatures.iterrows(): # - # subgraph_signature = (node_types[row['src_type']], edge_types[row['type']], node_types[row['dst_type']]) - subgraph_signature = (row['src_type'], row['type'], 
row['dst_type']) - - subset = edges.query( - f"src_type == '{row['src_type']}' and type == '{row['type']}' and dst_type == '{row['dst_type']}'" - ) - - typed_subgraphs[subgraph_signature] = list( - zip( - subset['src'].map(lambda old_id: typed_node_id[old_id]), - subset['dst'].map(lambda old_id: typed_node_id[old_id]) - ) - ) - - logging.info( - f"Unique triplet types in the graph: {len(typed_subgraphs.keys())}" - ) - - import dgl, torch - self.g = dgl.heterograph(typed_subgraphs, self.typed_node_counts) - - # node_types = dict(zip(self.node_types['str_type'], self.node_types['int_type'])) - - for ntype in self.g.ntypes: - # int_type = node_types[ntype] - - node_data = self.nodes.query( - f"type == '{ntype}'" - )[[ - 'typed_id', 'train_mask', 'test_mask', 'val_mask', 'id' # 'compact_label', - ]].sort_values('typed_id') - - self.g.nodes[ntype].data['train_mask'] = torch.tensor(node_data['train_mask'].values, dtype=torch.bool) - self.g.nodes[ntype].data['test_mask'] = torch.tensor(node_data['test_mask'].values, dtype=torch.bool) - self.g.nodes[ntype].data['val_mask'] = torch.tensor(node_data['val_mask'].values, dtype=torch.bool) - # self.g.nodes[ntype].data['labels'] = torch.tensor(node_data['compact_label'].values, dtype=torch.int64) - self.g.nodes[ntype].data['typed_id'] = torch.tensor(node_data['typed_id'].values, dtype=torch.int64) - self.g.nodes[ntype].data['original_id'] = torch.tensor(node_data['id'].values, dtype=torch.int64) - - @classmethod - def assess_need_for_self_loops(cls, nodes, edges): - # this is a hack when where are only outgoing connections from this node type - need_self_loop = set(edges['src'].values.tolist()) - set(edges['dst'].values.tolist()) - for nid in need_self_loop: - edges = edges.append({ - "id": -1, - "type": 99, - "src": nid, - "dst": nid - }, ignore_index=True) - - return nodes, edges - - # @classmethod - # def holdout(cls, nodes, edges, holdout_frac, random_seed): - # """ - # Create a set of holdout edges, ensure that there are no 
orphan nodes after these edges are removed. - # :param nodes: - # :param edges: - # :param holdout_frac: - # :param random_seed: - # :return: - # """ - # - # train, test = split(edges, holdout_frac, random_seed=random_seed) - # - # nodes, train_edges = ensure_connectedness(nodes, train) - # - # nodes, test_edges = ensure_valid_edges(nodes, test) - # - # return nodes, train_edges, test_edges - - # def mark_leaf_nodes(self): - # leaf_types = {'subword', "Op", "Constant", "Name"} # the last is used in graphs without subwords - # - # self.nodes['is_leaf'] = self.nodes['type_backup'].apply(lambda type_: type_ in leaf_types) - - def get_typed_node_id(self, node_id, node_type): - return self.typed_id_map[node_type][node_id] - - def get_global_node_id(self, node_id, node_type=None): - return self.node_id_to_global_id[node_id] - - def load_node_names(self): - """ - :return: DataFrame that contains mappings from nodes to names that appear more than once in the graph - """ - for_training = self.nodes[ - self.nodes['train_mask'] | self.nodes['test_mask'] | self.nodes['val_mask'] - ][['id', 'type_backup', 'name']]\ - .rename({"name": "serialized_name", "type_backup": "type"}, axis=1) - - node_names = extract_node_names(for_training, 2) - node_names = filter_dst_by_freq(node_names, freq=self.min_count_for_objectives) - - return node_names - # path = join(self.data_path, "node_names.bz2") - # return unpersist(path) - - def load_var_use(self): - """ - :return: DataFrame that contains mapping from function ids to variable names that appear in those functions - """ - path = join(self.data_path, "common_function_variable_pairs.bz2") - var_use = unpersist(path) - var_use = filter_dst_by_freq(var_use, freq=self.min_count_for_objectives) - return var_use - - def load_api_call(self): - path = join(self.data_path, "common_call_seq.bz2") - api_call = unpersist(path) - api_call = filter_dst_by_freq(api_call, freq=self.min_count_for_objectives) - return api_call - - def 
load_token_prediction(self): - """ - Return names for all nodes that represent local mentions - :return: DataFrame that contains mappings from local mentions to names that these mentions represent. Applies - only to nodes that have subwords and have appeared in a scope (names have `@` in their names) - """ - if self.use_edge_types: - edges = self.edges.query("type == 'subword_'") - else: - edges = self.edges.query("type_backup == 'subword_'") - - target_nodes = set(edges["dst"].to_list()) - target_nodes = self.nodes.query("id in @target_nodes", local_dict={"target_nodes": target_nodes})[["id", "name"]] - - name_extr = lambda x: x.split('@')[0] - # target_nodes.eval("group = name.map(@get_group)", local_dict={"get_group": get_name_group}, inplace=True) - # target_nodes.dropna(axis=0, inplace=True) - target_nodes.eval("name = name.map(@name_extr)", local_dict={"name_extr": name_extr}, inplace=True) - target_nodes.rename({"id": "src", "name": "dst"}, axis=1, inplace=True) - target_nodes = filter_dst_by_freq(target_nodes, freq=self.min_count_for_objectives) - # target_nodes.eval("cooccurr = dst.map(@occ)", local_dict={"occ": lambda name: name_cooccurr_freq.get(name, Counter())}, inplace=True) - - return target_nodes - - def load_global_edges_prediction(self): - - nodes_path = join(self.data_path, "nodes.bz2") - edges_path = join(self.data_path, "edges.bz2") - - _, edges = load_data(nodes_path, edges_path) - - global_edges = get_global_edges() - global_edges = global_edges - {"defines", "defined_in"} # these edges are already in AST? 
- global_edges.add("global_mention") - - is_global = lambda type: type in global_edges - edges = edges.query("type.map(@is_global)", local_dict={"is_global": is_global}) - - edges.rename( - { - "source_node_id": "src", - "target_node_id": "dst" - }, inplace=True, axis=1 - ) - - return edges[["src", "dst"]] - - def load_docstring(self): - - docstrings_path = os.path.join(self.data_path, "common_source_graph_bodies.bz2") - - dosctrings = unpersist(docstrings_path)[["id", "docstring"]] - - from nltk import sent_tokenize - - def normalize(text): - if text is None or len(text.strip()) == 0: - return pd.NA - return "\n".join(sent_tokenize(text)[:3]).replace("\n", " ") - - dosctrings.eval("docstring = docstring.map(@normalize)", local_dict={"normalize": normalize}, inplace=True) - dosctrings.dropna(axis=0, inplace=True) - - dosctrings.rename({ - "id": "src", - "docstring": "dst" - }, axis=1, inplace=True) - - return dosctrings - - def buckets_from_pretrained_embeddings(self, pretrained_path, n_buckets): - - from SourceCodeTools.nlp.embed.fasttext import load_w2v_map - from SourceCodeTools.nlp import token_hasher - pretrained = load_w2v_map(pretrained_path) - - import numpy as np - - embs_init = np.random.randn(n_buckets, pretrained.n_dims).astype(np.float32) - - for word in pretrained.keys(): - ind = token_hasher(word, n_buckets) - embs_init[ind, :] = pretrained[word] - - def op_embedding(op_tokens): - embedding = None - for token in op_tokens: - token_emb = pretrained.get(token, None) - if embedding is None: - embedding = token_emb - else: - embedding = embedding + token_emb - return embedding - - python_ops_to_bpe = self.op_tokens() - for op, op_tokens in python_ops_to_bpe.items(): - op_emb = op_embedding(op_tokens) - if op_emb is not None: - op_ind = token_hasher(op, n_buckets) - embs_init[op_ind, :] = op_emb - - return embs_init - - def create_subword_masker(self): - """ - :return: SubwordMasker for all nodes that have subwords. 
Suitable for token prediction objective. - """ - return SubwordMasker(self.nodes, self.edges) - - def create_variable_name_masker(self, tokenizer_path): - """ - :param tokenizer_path: path to bpe tokenizer - :return: SubwordMasker for function nodes. Suitable for variable name use prediction objective - """ - return NodeNameMasker(self.nodes, self.edges, self.load_var_use(), tokenizer_path) - - def create_node_name_masker(self, tokenizer_path): - """ - :param tokenizer_path: path to bpe tokenizer - :return: SubwordMasker for function nodes. Suitable for node name use prediction objective - """ - return NodeNameMasker(self.nodes, self.edges, self.load_node_names(), tokenizer_path) - - -def ensure_connectedness(nodes: pandas.DataFrame, edges: pandas.DataFrame): - """ - Filtering isolated nodes - :param nodes: DataFrame - :param edges: DataFrame - :return: - """ - - logging.info( - f"Filtering isolated nodes. " - f"Starting from {nodes.shape[0]} nodes and {edges.shape[0]} edges...", - ) - unique_nodes = set(edges['src'].values.tolist() + - edges['dst'].values.tolist()) - - nodes = nodes[ - nodes['id'].apply(lambda nid: nid in unique_nodes) - ] - - logging.info( - f"Ending up with {nodes.shape[0]} nodes and {edges.shape[0]} edges" - ) - - return nodes, edges - - -def ensure_valid_edges(nodes, edges, ignore_src=False): - """ - Filter edges that link to nodes that do not exist - :param nodes: - :param edges: - :param ignore_src: - :return: - """ - print( - f"Filtering edges to invalid nodes. 
" - f"Starting from {nodes.shape[0]} nodes and {edges.shape[0]} edges...", - end="" - ) - - unique_nodes = set(nodes['id'].values.tolist()) - - if not ignore_src: - edges = edges[ - edges['src'].apply(lambda nid: nid in unique_nodes) - ] - - edges = edges[ - edges['dst'].apply(lambda nid: nid in unique_nodes) - ] - - print( - f"ending up with {nodes.shape[0]} nodes and {edges.shape[0]} edges" - ) - - return nodes, edges - - -def read_or_create_dataset(args, model_base, labels_from="type"): - if args.restore_state: - # i'm not happy with this behaviour that differs based on the flag status - dataset = pickle.load(open(join(model_base, "dataset.pkl"), "rb")) - else: - dataset = SourceGraphDataset( - # args.node_path, args.edge_path, - args.data_path, - label_from=labels_from, - use_node_types=args.use_node_types, - use_edge_types=args.use_edge_types, - filter=args.filter_edges, - self_loops=args.self_loops, - train_frac=args.train_frac, - tokenizer_path=args.tokenizer, - random_seed=args.random_seed, - min_count_for_objectives=args.min_count_for_objectives, - no_global_edges=args.no_global_edges, - remove_reverse=args.remove_reverse, - package_names=open(args.packages_file).readlines() if args.packages_file is not None else None - ) - - # save dataset state for recovery - pickle.dump(dataset, open(join(model_base, "dataset.pkl"), "wb")) - - return dataset - - -def test_dataset(): - import sys - - data_path = sys.argv[1] - # nodes_path = sys.argv[1] - # edges_path = sys.argv[2] - - dataset = SourceGraphDataset( - data_path, - # nodes_path, edges_path, - label_from='type', - use_node_types=False, - use_edge_types=True, - ) - - # sm = dataset.create_subword_masker() - print(dataset) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(module)s:%(lineno)d:%(message)s") - test_dataset() diff --git a/SourceCodeTools/code/data/sourcetrail/DatasetCreator2.py b/SourceCodeTools/code/data/sourcetrail/DatasetCreator2.py index 
88217468..dd69c271 100644 --- a/SourceCodeTools/code/data/sourcetrail/DatasetCreator2.py +++ b/SourceCodeTools/code/data/sourcetrail/DatasetCreator2.py @@ -1,8 +1,14 @@ +import logging +import os from os.path import join -from tqdm import tqdm -from SourceCodeTools.code.data.sourcetrail.common import map_offsets -from SourceCodeTools.code.data.sourcetrail.sourcetrail_map_id_columns import map_columns +from SourceCodeTools.cli_arguments import DatasetCreatorArguments +from SourceCodeTools.code.annotator_utils import map_offsets +from SourceCodeTools.code.common import read_nodes +from SourceCodeTools.code.data.AbstractDatasetCreator import AbstractDatasetCreator +from SourceCodeTools.code.data.ast_graph.filter_type_edges import filter_type_edges_with_chunks +from SourceCodeTools.code.data.file_utils import filenames, unpersist_if_present, read_element_component +from SourceCodeTools.code.data.sourcetrail.sourcetrail_filter_type_edges import filter_type_edges from SourceCodeTools.code.data.sourcetrail.sourcetrail_merge_graphs import get_global_node_info, merge_global_with_local from SourceCodeTools.code.data.sourcetrail.sourcetrail_node_local2global import get_local2global from SourceCodeTools.code.data.sourcetrail.sourcetrail_node_name_merge import merge_names @@ -15,179 +21,85 @@ from SourceCodeTools.code.data.sourcetrail.sourcetrail_extract_variable_names import extract_var_names from SourceCodeTools.code.data.sourcetrail.sourcetrail_extract_node_names import extract_node_names -from SourceCodeTools.code.data.sourcetrail.file_utils import * +class DatasetCreator(AbstractDatasetCreator): + """ + Merges several environments indexed with Sourcetrail into a single graph. 
+ """ + + merging_specification = { + "nodes.bz2": {"columns": ['id'], "output_path": "common_nodes.json", "ensure_unique_with": ['type', 'serialized_name']}, + "edges.bz2": {"columns": ['target_node_id', 'source_node_id'], "output_path": "common_edges.json"}, + "bodies.bz2": {"columns": ['id'], "output_path": "common_bodies.json", "columns_special": [("replacement_list", map_offsets)]}, + "function_variable_pairs.bz2": {"columns": ['src'], "output_path": "common_function_variable_pairs.json"}, + "call_seq.bz2": {"columns": ['src', 'dst'], "output_path": "common_call_seq.json"}, + + "nodes_with_ast.bz2": {"columns": ['id', 'mentioned_in'], "output_path": "common_nodes.json", "ensure_unique_with": ['type', 'serialized_name']}, + "edges_with_ast.bz2": {"columns": ['target_node_id', 'source_node_id', 'mentioned_in'], "output_path": "common_edges.json"}, + "offsets.bz2": {"columns": ['node_id'], "output_path": "common_offsets.json", "columns_special": [("mentioned_in", map_offsets)]}, + "filecontent_with_package.bz2": {"columns": [], "output_path": "common_filecontent.json"}, + "name_mappings.bz2": {"columns": [], "output_path": "common_name_mappings.json"}, + } + + files_for_merging = [ + "nodes.bz2", "edges.bz2", "bodies.bz2", "function_variable_pairs.bz2", "call_seq.bz2" + ] + files_for_merging_with_ast = [ + "nodes_with_ast.bz2", "edges_with_ast.bz2", "bodies.bz2", "function_variable_pairs.bz2", + "call_seq.bz2", "offsets.bz2", "filecontent_with_package.bz2", "name_mappings.bz2" + ] + + edge_priority = { + "next": -1, "prev": -1, "global_mention": -1, "global_mention_rev": -1, + "calls": 0, + "called_by": 0, + "defines": 1, + "defined_in": 1, + "inheritance": 1, + "inherited_by": 1, + "imports": 1, + "imported_by": 1, + "uses": 2, + "used_by": 2, + "uses_type": 2, + "type_used_by": 2, + "mention_scope": 10, + "mention_scope_rev": 10, + "defined_in_function": 4, + "defined_in_function_rev": 4, + "defined_in_class": 5, + "defined_in_class_rev": 5, + 
"defined_in_module": 6, + "defined_in_module_rev": 6 + } + + restricted_edges = {"global_mention_rev"} + restricted_in_types = { + "Op", "Constant", "#attr#", "#keyword#", + 'CtlFlow', 'JoinedStr', 'Name', 'ast_Literal', + 'subword', 'type_annotation' + } + + type_annotation_edge_types = ['annotation_for', 'returned_by'] -class DatasetCreator: def __init__( self, path, lang, bpe_tokenizer, create_subword_instances, connect_subwords, only_with_annotations, - do_extraction=False, visualize=False, track_offsets=False + do_extraction=False, visualize=False, track_offsets=False, remove_type_annotations=False, + recompute_l2g=False ): - self.indexed_path = path - self.lang = lang - self.bpe_tokenizer = bpe_tokenizer - self.create_subword_instances = create_subword_instances - self.connect_subwords = connect_subwords - self.only_with_annotations = only_with_annotations - self.extract = do_extraction - self.visualize = visualize - self.track_offsets = track_offsets - - paths = (os.path.join(path, dir) for dir in os.listdir(path)) - self.environments = sorted(list(filter(lambda path: os.path.isdir(path), paths)), key=lambda x: x.lower()) - - self.local2global_cache = {} + super().__init__( + path, lang, bpe_tokenizer, create_subword_instances, connect_subwords, only_with_annotations, + do_extraction, visualize, track_offsets, remove_type_annotations, recompute_l2g + ) from SourceCodeTools.code.data.sourcetrail.common import UNRESOLVED_SYMBOL self.unsolved_symbol = UNRESOLVED_SYMBOL - def merge(self, output_directory): - - if self.extract: - logging.info("Extracting...") - self.do_extraction() - - no_ast_path, with_ast_path = self.create_output_dirs(output_directory) - - if not self.only_with_annotations: - self.create_global_file("nodes.bz2", "local2global.bz2", ['id'], - join(no_ast_path, "common_nodes.bz2"), message="Merging nodes", ensure_unique_with=['type', 'serialized_name']) - self.create_global_file("edges.bz2", "local2global.bz2", ['target_node_id', 
'source_node_id'], - join(no_ast_path, "common_edges.bz2"), message="Merging edges") - self.create_global_file("source_graph_bodies.bz2", "local2global.bz2", ['id'], - join(no_ast_path, "common_source_graph_bodies.bz2"), "Merging bodies", columns_special=[("replacement_list", map_offsets)]) - self.create_global_file("function_variable_pairs.bz2", "local2global.bz2", ['src'], - join(no_ast_path, "common_function_variable_pairs.bz2"), "Merging variables") - self.create_global_file("call_seq.bz2", "local2global.bz2", ['src', 'dst'], - join(no_ast_path, "common_call_seq.bz2"), "Merging call seq") - # self.create_global_file("name_groups.bz2", "local2global.bz2", [], - # join(no_ast_path, "name_groups.bz2"), "Merging name groups") - - global_nodes = self.filter_orphaned_nodes( - unpersist(join(no_ast_path, "common_nodes.bz2")), no_ast_path - ) - persist(global_nodes, join(no_ast_path, "common_nodes.bz2")) - node_names = extract_node_names( - global_nodes, min_count=2 - ) - if node_names is not None: - persist(node_names, join(no_ast_path, "node_names.bz2")) - - if self.visualize: - self.visualize_func( - unpersist(join(no_ast_path, "common_nodes.bz2")), - unpersist(join(no_ast_path, "common_edges.bz2")), - join(no_ast_path, "visualization.pdf") - ) - - self.create_global_file("nodes_with_ast.bz2", "local2global_with_ast.bz2", ['id', 'mentioned_in'], - join(with_ast_path, "common_nodes.bz2"), message="Merging nodes with ast", ensure_unique_with=['type', 'serialized_name']) - self.create_global_file("edges_with_ast.bz2", "local2global_with_ast.bz2", ['target_node_id', 'source_node_id', 'mentioned_in'], - join(with_ast_path, "common_edges.bz2"), "Merging edges with ast") - self.create_global_file("source_graph_bodies.bz2", "local2global_with_ast.bz2", ['id'], - join(with_ast_path, "common_source_graph_bodies.bz2"), "Merging bodies with ast", columns_special=[("replacement_list", map_offsets)]) - self.create_global_file("function_variable_pairs.bz2", 
"local2global_with_ast.bz2", ['src'], - join(with_ast_path, "common_function_variable_pairs.bz2"), "Merging variables with ast") - self.create_global_file("call_seq.bz2", "local2global_with_ast.bz2", ['src', 'dst'], - join(with_ast_path, "common_call_seq.bz2"), "Merging call seq with ast") - # self.create_global_file("name_groups.bz2", "local2global_with_ast.bz2", [], - # join(with_ast_path, "name_groups.bz2"), "Merging name groups") - - global_nodes = self.filter_orphaned_nodes( - unpersist(join(with_ast_path, "common_nodes.bz2")), with_ast_path - ) - persist(global_nodes, join(with_ast_path, "common_nodes.bz2")) - node_names = extract_node_names( - global_nodes, min_count=2 - ) - if node_names is not None: - persist(node_names, join(with_ast_path, "node_names.bz2")) - - if self.visualize: - self.visualize_func( - unpersist(join(with_ast_path, "common_nodes.bz2")), - unpersist(join(with_ast_path, "common_edges.bz2")), - join(with_ast_path, "visualization.pdf") - ) - - - def do_extraction(self): - global_nodes = None - global_nodes_with_ast = None - name_groups = None - - for env_path in self.environments: - logging.info(f"Found {os.path.basename(env_path)}") - - if not self.is_indexed(env_path): - logging.info("Package not indexed") - continue - - nodes, edges, source_location, occurrence, filecontent, element_component = \ - self.read_sourcetrail_files(env_path) - - if nodes is None: - logging.info("Index is empty") - continue - - edges = filter_ambiguous_edges(edges, element_component) - - nodes, edges = self.filter_unsolved_symbols(nodes, edges) - - bodies = process_bodies(nodes, edges, source_location, occurrence, filecontent, self.lang) - call_seq = extract_call_seq(nodes, edges, source_location, occurrence) - - edges = add_reverse_edges(edges) - - # if bodies is not None: - ast_nodes, ast_edges, offsets, name_group_tracker = get_ast_from_modules( - nodes, edges, source_location, occurrence, filecontent, - self.bpe_tokenizer, self.create_subword_instances, 
self.connect_subwords, self.lang, - track_offsets=self.track_offsets - ) - nodes_with_ast = nodes.append(ast_nodes) - edges_with_ast = edges.append(ast_edges) - if bodies is not None: - vars = extract_var_names(nodes, bodies, self.lang) - else: - vars = None - if name_groups is None: - name_groups = name_group_tracker - else: - name_groups = name_groups.append(name_group_tracker) - # else: - # nodes_with_ast = nodes - # edges_with_ast = edges - # vars = None - # offsets = None - - global_nodes = self.merge_with_global(global_nodes, nodes) - global_nodes_with_ast = self.merge_with_global(global_nodes_with_ast, nodes_with_ast) - - local2global = get_local2global( - global_nodes=global_nodes, local_nodes=nodes - ) - local2global_with_ast = get_local2global( - global_nodes=global_nodes_with_ast, local_nodes=nodes_with_ast - ) - - self.write_local(env_path, nodes, edges, bodies, call_seq, vars, - nodes_with_ast, edges_with_ast, offsets, - local2global, local2global_with_ast, name_groups) - - def get_local2global(self, path): - if path in self.local2global_cache: - return self.local2global_cache[path] - else: - local2global_df = unpersist_if_present(path) - if local2global_df is None: - return None - else: - local2global = dict(zip(local2global_df['id'], local2global_df['global_id'])) - self.local2global_cache[path] = local2global - return local2global + def _prepare_environments(self): + paths = (os.path.join(self.path, dir) for dir in os.listdir(self.path)) + self.environments = sorted(list(filter(lambda path: os.path.isdir(path), paths)), key=lambda x: x.lower()) def create_output_dirs(self, output_path): if not os.path.isdir(output_path): @@ -204,14 +116,16 @@ def create_output_dirs(self, output_path): return no_ast_path, with_ast_path - def is_indexed(self, path): + @staticmethod + def is_indexed(path): basename = os.path.basename(path) if os.path.isfile(os.path.join(path, f"{basename}.srctrldb")): return True else: return False - def get_csv_name(self, name, path): 
+ @staticmethod + def get_csv_name(name, path): return os.path.join(path, filenames[name]) def filter_unsolved_symbols(self, nodes, edges): @@ -237,31 +151,11 @@ def read_sourcetrail_files(self, env_path): else: return nodes, edges, source_location, occurrence, filecontent, element_component - def write_local(self, dir, nodes, edges, bodies, call_seq, vars, - nodes_with_ast, edges_with_ast, offsets, - local2global, local2global_with_ast, name_groups): - write_nodes(nodes, dir) - write_edges(edges, dir) - if bodies is not None: - write_processed_bodies(bodies, dir) - if call_seq is not None: - persist(call_seq, join(dir, filenames['call_seq'])) - if vars is not None: - persist(vars, join(dir, filenames['function_variable_pairs'])) - persist(nodes_with_ast, join(dir, "nodes_with_ast.bz2")) - persist(edges_with_ast, join(dir, "edges_with_ast.bz2")) - if offsets is not None: - persist(offsets, join(dir, "offsets.bz2")) - if len(edges_with_ast.query("type == 'annotation_for' or type == 'returned_by'")) > 0: - with open(join(dir, "has_annotations"), "w") as has_annotations: - pass - persist(local2global, join(dir, "local2global.bz2")) - persist(local2global_with_ast, join(dir, "local2global_with_ast.bz2")) - - if name_groups is not None: - persist(name_groups, join(dir, "name_groups.bz2")) - def get_global_node_info(self, global_nodes): + """ + :param global_nodes: nodes from a global merged graph + :return: Set of existing nodes represented with (type, node_name), minimal available free id + """ if global_nodes is None: existing_nodes, next_valid_id = set(), 0 else: @@ -269,6 +163,12 @@ def get_global_node_info(self, global_nodes): return existing_nodes, next_valid_id def merge_with_global(self, global_nodes, local_nodes): + """ + Merge nodes obtained from the source code with the previously existing nodes. 
+ :param global_nodes: Nodes from a global inter-package graph + :param local_nodes: Nodes from a local file-level graph + :return: Updated version of the global inter-package graph + """ existing_nodes, next_valid_id = self.get_global_node_info(global_nodes) new_nodes = merge_global_with_local(existing_nodes, next_valid_id, local_nodes) @@ -279,80 +179,127 @@ def merge_with_global(self, global_nodes, local_nodes): return global_nodes - def merge_files(self, env_path, filename, map_filename, columns_to_map, original, columns_special=None): - input_table_path = join(env_path, filename) - local2global = self.get_local2global(join(env_path, map_filename)) - if os.path.isfile(input_table_path) and local2global is not None: - input_table = unpersist(input_table_path) - if self.only_with_annotations: - if not os.path.isfile(join(env_path, "has_annotations")): - return original - new_table = map_columns(input_table, local2global, columns_to_map, columns_special=columns_special) - if original is None: - return new_table + def do_extraction(self): + global_nodes = set() + global_nodes_with_ast = set() + + for env_path in self.environments: + logging.info(f"Found {os.path.basename(env_path)}") + + if not self.is_indexed(env_path): + logging.info("Package not indexed") + continue + + if not self.recompute_l2g: + + nodes, edges, source_location, occurrence, filecontent, element_component = \ + self.read_sourcetrail_files(env_path) + + if nodes is None: + logging.info("Index is empty") + continue + + edges = filter_ambiguous_edges(edges, element_component) + + nodes, edges = self.filter_unsolved_symbols(nodes, edges) + + bodies = process_bodies(nodes, edges, source_location, occurrence, filecontent, self.lang) + call_seq = extract_call_seq(nodes, edges, source_location, occurrence) + + edges = add_reverse_edges(edges) + + # if bodies is not None: + ast_nodes, ast_edges, offsets, name_mappings = get_ast_from_modules( + nodes, edges, source_location, occurrence, filecontent, + 
self.bpe_tokenizer, self.create_subword_instances, self.connect_subwords, self.lang, + track_offsets=self.track_offsets + ) + + if offsets is not None: + offsets["package"] = os.path.basename(env_path) + filecontent["package"] = os.path.basename(env_path) + + # need this check in situations when module has a single file and this file cannot be parsed + nodes_with_ast = nodes.append(ast_nodes) if ast_nodes is not None else nodes + edges_with_ast = edges.append(ast_edges) if ast_edges is not None else edges + + if bodies is not None: + vars = extract_var_names(nodes, bodies, self.lang) + else: + vars = None else: - return original.append(new_table) - else: - return original - - def create_global_file(self, local_file, local2global_file, columns, output_path, message, ensure_unique_with=None, columns_special=None): - global_table = None - for ind, env_path in tqdm( - enumerate(self.environments), desc=message, leave=True, - dynamic_ncols=True, total=len(self.environments) - ): - global_table = self.merge_files( - env_path, local_file, local2global_file, columns, global_table, columns_special=columns_special + nodes = unpersist_if_present(join(env_path, "nodes.bz2")) + nodes_with_ast = unpersist_if_present(join(env_path, "nodes_with_ast.bz2")) + + if nodes is None or nodes_with_ast is None: + continue + + edges = bodies = call_seq = vars = edges_with_ast = offsets = name_mappings = filecontent = None + + # global_nodes = self.merge_with_global(global_nodes, nodes) + # global_nodes_with_ast = self.merge_with_global(global_nodes_with_ast, nodes_with_ast) + + local2global = get_local2global( + global_nodes=global_nodes, local_nodes=nodes + ) + local2global_with_ast = get_local2global( + global_nodes=global_nodes_with_ast, local_nodes=nodes_with_ast ) - if ensure_unique_with is not None: - global_table = global_table.drop_duplicates(subset=ensure_unique_with) + global_nodes.update(local2global["global_id"]) + global_nodes_with_ast.update(local2global_with_ast["global_id"]) 
- if global_table is not None: - global_table.reset_index(drop=True, inplace=True) - assert len(global_table) == len(global_table.index.unique()) + self.write_type_annotation_flag(edges_with_ast, env_path) - persist(global_table, output_path) + self.write_local( + env_path, nodes=nodes, edges=edges, bodies=bodies, call_seq=call_seq, function_variable_pairs=vars, + nodes_with_ast=nodes_with_ast, edges_with_ast=edges_with_ast, offsets=offsets, + local2global=local2global, local2global_with_ast=local2global_with_ast, + name_mappings=name_mappings, filecontent_with_package=filecontent + ) - def filter_orphaned_nodes(self, global_nodes, output_dir): - edges = unpersist(join(output_dir, "common_edges.bz2")) - active_nodes = set(edges['source_node_id'].tolist() + edges['target_node_id'].tolist()) - global_nodes = global_nodes[ - global_nodes['id'].apply(lambda id_: id_ in active_nodes) - ] - return global_nodes + self.compact_mapping_for_l2g(global_nodes, "local2global.bz2") + self.compact_mapping_for_l2g(global_nodes_with_ast, "local2global_with_ast.bz2") + + @staticmethod + def extract_node_names(nodes_path, min_count): + logging.info("Extract node names") + return extract_node_names(read_nodes(nodes_path), min_count=min_count) + + def filter_type_edges(self, nodes_path, edges_path): + logging.info("Filter type edges") + filter_type_edges_with_chunks(nodes_path, edges_path, kwarg_fn=self.get_writing_mode) + + def merge(self, output_directory): + + if self.extract: + logging.info("Extracting...") + self.do_extraction() + + no_ast_path, with_ast_path = self.create_output_dirs(output_directory) + + if not self.only_with_annotations: + self.merge_graph_without_ast(no_ast_path) + + self.merge_graph_with_ast(with_ast_path) def visualize_func(self, nodes, edges, output_path): from SourceCodeTools.code.data.sourcetrail.sourcetrail_draw_graph import visualize visualize(nodes, edges, output_path) + if __name__ == "__main__": - import argparse - - parser = 
argparse.ArgumentParser(description='Merge indexed environments into a single graph') - parser.add_argument('indexed_environments', - help='Path to environments indexed by sourcetrail') - parser.add_argument('output_directory', - help='') - parser.add_argument('--language', "-l", dest="language", default="python", - help='Path to environments indexed by sourcetrail') - parser.add_argument('--bpe_tokenizer', '-bpe', dest='bpe_tokenizer', type=str, - help='') - parser.add_argument('--create_subword_instances', action='store_true', default=False, help="") - parser.add_argument('--connect_subwords', action='store_true', default=False, - help="Takes effect only when `create_subword_instances` is False") - parser.add_argument('--only_with_annotations', action='store_true', default=False, help="") - parser.add_argument('--do_extraction', action='store_true', default=False, help="") - parser.add_argument('--visualize', action='store_true', default=False, help="") - parser.add_argument('--track_offsets', action='store_true', default=False, help="") - - - args = parser.parse_args() + + args = DatasetCreatorArguments().parse() + + if args.recompute_l2g: + args.do_extraction = True logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(message)s") - dataset = DatasetCreator(args.indexed_environments, args.language, - args.bpe_tokenizer, args.create_subword_instances, - args.connect_subwords, args.only_with_annotations, - args.do_extraction, args.visualize, args.track_offsets) + dataset = DatasetCreator( + args.indexed_environments, args.language, args.bpe_tokenizer, args.create_subword_instances, + args.connect_subwords, args.only_with_annotations, args.do_extraction, args.visualize, args.track_offsets, + args.remove_type_annotations, args.recompute_l2g + ) dataset.merge(args.output_directory) diff --git a/SourceCodeTools/code/data/sourcetrail/annotations_random_split.py b/SourceCodeTools/code/data/sourcetrail/annotations_random_split.py index 
a9fae023..60701586 100644 --- a/SourceCodeTools/code/data/sourcetrail/annotations_random_split.py +++ b/SourceCodeTools/code/data/sourcetrail/annotations_random_split.py @@ -1,4 +1,4 @@ -from SourceCodeTools.code.data.sourcetrail.Dataset import split, ensure_connectedness, ensure_valid_edges +from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset # split, ensure_connectedness, ensure_valid_edges import pandas as pd import sys from os.path import join, dirname @@ -16,9 +16,9 @@ nodes['id'] = pd.concat([type_ann['src'], type_ann['dst']], axis=0).unique() -nodes, train = ensure_connectedness(nodes, train) +nodes, train = SourceGraphDataset.ensure_connectedness(nodes, train) -nodes, test = ensure_valid_edges(nodes, test, ignore_src=True) +nodes, test = SourceGraphDataset.ensure_valid_edges(nodes, test, ignore_src=True) train.rename({ "dst":"source_node_id", diff --git a/SourceCodeTools/code/data/sourcetrail/assess_type_pred.neighbours.py b/SourceCodeTools/code/data/sourcetrail/assess_type_pred.neighbours.py new file mode 100644 index 00000000..8a529f1e --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/assess_type_pred.neighbours.py @@ -0,0 +1,57 @@ +import pickle + +import sys + +import numpy as np + +from SourceCodeTools.code.data.file_utils import unpersist + +type_annotations_path = sys.argv[1] +nodes = sys.argv[2] +embeddings_path = sys.argv[3] + +nodes = unpersist(nodes) +type_annotations = unpersist(type_annotations_path) +embs = pickle.load(open(embeddings_path, "rb"))# [0] + +def normalize(typeann): + return typeann.strip("\"").strip("'").split("[")[0] + +node2nodetype = dict(zip(nodes["id"], nodes["type"])) + +type_annotations["dst"] = type_annotations["dst"].apply(normalize) + +type_annotations = type_annotations[ + type_annotations["src"].apply(lambda x: node2nodetype[x] == "mention") +] + +seed_nodes = type_annotations["src"] +node2type = dict(zip(type_annotations["src"], type_annotations["dst"])) + +seed_embs = [] +missing_emb = 
[] +for nid in seed_nodes: + if nid in embs: + seed_embs.append(embs[nid]) + else: + missing_emb.append(nid) + +type_embs = np.vstack(seed_embs) + +added = set() +same_or_not = dict() + +for nid, emb in zip(seed_nodes, type_embs): + diff = emb.reshape(1, -1) - type_embs + diff2 = np.square(diff) + dist = np.sqrt(np.sum(diff2, axis=1)) + order = np.argsort(dist) + min_pos = order[1] + closest = seed_nodes.iloc[min_pos] + if (nid, closest) not in added and (closest, nid) not in added: + added.add((nid, closest)) + added.add((closest, nid)) + same_or_not[(nid, closest)] = node2type[nid] == node2type[closest] + +print(sum(list(same_or_not.values())) / len(same_or_not)) + diff --git a/SourceCodeTools/code/data/sourcetrail/common.py b/SourceCodeTools/code/data/sourcetrail/common.py index 8cc28b4c..e8635e2d 100644 --- a/SourceCodeTools/code/data/sourcetrail/common.py +++ b/SourceCodeTools/code/data/sourcetrail/common.py @@ -1,31 +1,21 @@ -from SourceCodeTools.code.data.sourcetrail.file_utils import * +import hashlib + +from SourceCodeTools.code.data.file_utils import * from tqdm import tqdm DEFINITION_TYPE = 1 UNRESOLVED_SYMBOL = "unsolved_symbol" -import sqlite3 - - -class SQLTable: - def __init__(self, df, filename, table_name): - self.conn = sqlite3.connect(filename) - self.path = filename - self.table_name = table_name - - df.to_sql(self.table_name, con=self.conn, if_exists='replace', index=False, index_label=df.columns) - - def query(self, query_string): - return pd.read_sql(query_string, self.conn) - - def __del__(self): - self.conn.close() - if os.path.isfile(self.path): - os.remove(self.path) - - def get_occurrence_groups(nodes, edges, source_location, occurrence): + """ + Group nodes based on file id. Return dataset that contains node ids and their offsets in the source code. 
+ :param nodes: dataframe with nodes + :param edges: dataframe with edges + :param source_location: dataframe with sources + :param occurrence: dataframe with with offsets + :return: Result of group by file id + """ edges = edges.rename(columns={'type': 'e_type'}) edges = edges.query("id >= 0") # filter reverse edges @@ -70,53 +60,3 @@ def sql_get_occurrences_from_range(occurrences, start, end) -> pd.DataFrame: f"select * from {occurrences.table_name} where start_line >= {start} and end_line <= {end} and occ_type != {DEFINITION_TYPE} and start_line = end_line") df = df.astype({"source_node_id": "Int32", "target_node_id": "Int32"}) return df - - -def create_node_repr(nodes): - return list(zip(nodes['serialized_name'], nodes['type'])) - - -def map_id_columns(df, column_names, mapper): - df = df.copy() - for col in column_names: - if col in df.columns: - df[col] = df[col].apply(lambda x: mapper.get(x, pd.NA)) - return df - - -def map_offsets(column, id_map): - def map_entry(entry): - return [(e[0], e[1], id_map[e[2]]) for e in entry] - return [map_entry(entry) for entry in column] - - -def merge_with_file_if_exists(df, merge_with_file): - if os.path.isfile(merge_with_file): - original_data = unpersist(merge_with_file) - data = pd.concat([original_data, df], axis=0) - else: - data = df - return data - - -def create_local_to_global_id_map(local_nodes, global_nodes): - local_nodes = local_nodes.copy() - global_nodes = global_nodes.copy() - - global_nodes['node_repr'] = create_node_repr(global_nodes) - local_nodes['node_repr'] = create_node_repr(local_nodes) - - rev_id_map = dict(zip( - global_nodes['node_repr'].tolist(), global_nodes['id'].tolist() - )) - id_map = dict(zip( - local_nodes["id"].tolist(), map( - lambda x: rev_id_map[x], local_nodes["node_repr"].tolist() - ) - )) - - return id_map - - -def custom_tqdm(iterable, total, message): - return tqdm(iterable, total=total, desc=message, leave=False, dynamic_ncols=True) \ No newline at end of file diff --git 
a/SourceCodeTools/code/data/sourcetrail/deprecated/DatasetCreator.py b/SourceCodeTools/code/data/sourcetrail/deprecated/DatasetCreator.py index 82a3929c..073953e8 100644 --- a/SourceCodeTools/code/data/sourcetrail/deprecated/DatasetCreator.py +++ b/SourceCodeTools/code/data/sourcetrail/deprecated/DatasetCreator.py @@ -14,7 +14,7 @@ from SourceCodeTools.code.data.sourcetrail.sourcetrail_extract_variable_names import extract_var_names from SourceCodeTools.code.data.sourcetrail.sourcetrail_extract_node_names import extract_node_names -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * class DatasetCreator: diff --git a/SourceCodeTools/code/data/sourcetrail/deprecated/sourcetrail_extract_type_information.py b/SourceCodeTools/code/data/sourcetrail/deprecated/sourcetrail_extract_type_information.py deleted file mode 100644 index 590f4be9..00000000 --- a/SourceCodeTools/code/data/sourcetrail/deprecated/sourcetrail_extract_type_information.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys - -from SourceCodeTools.code.data.sourcetrail.file_utils import * - - -def main(): - edges = unpersist(sys.argv[1]) - out_annotations = sys.argv[2] - out_no_annotations = sys.argv[3] - - annotations = edges.query(f"type == 'annotation_for' or type == 'returned_by'") - no_annotations = edges.query(f"type != 'annotation_for' and type != 'returned_by'") - - persist(annotations, out_annotations) - persist(no_annotations, out_no_annotations) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/deprecated/verify_edges.py b/SourceCodeTools/code/data/sourcetrail/deprecated/verify_edges.py index 9523a025..c8e87b17 100644 --- a/SourceCodeTools/code/data/sourcetrail/deprecated/verify_edges.py +++ b/SourceCodeTools/code/data/sourcetrail/deprecated/verify_edges.py @@ -1,4 +1,4 @@ -from SourceCodeTools.code.data.sourcetrail.Dataset import load_data +from 
SourceCodeTools.code.data.dataset.Dataset import load_data import sys node_path = sys.argv[1] diff --git a/SourceCodeTools/code/data/sourcetrail/extra_reverse_finder.py b/SourceCodeTools/code/data/sourcetrail/extra_reverse_finder.py index a60fa359..297994dd 100644 --- a/SourceCodeTools/code/data/sourcetrail/extra_reverse_finder.py +++ b/SourceCodeTools/code/data/sourcetrail/extra_reverse_finder.py @@ -1,7 +1,7 @@ import os import sys -from SourceCodeTools.code.data.sourcetrail.file_utils import unpersist +from SourceCodeTools.code.data.file_utils import unpersist def main(): diff --git a/SourceCodeTools/code/data/sourcetrail/get_embeddings_with_string.py b/SourceCodeTools/code/data/sourcetrail/get_embeddings_with_string.py new file mode 100644 index 00000000..9eb532eb --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/get_embeddings_with_string.py @@ -0,0 +1,51 @@ +import pickle +import sys + +import pandas as pd + +from SourceCodeTools.code.data.file_utils import unpersist + +# get tb embedding where labels are human-readable strings + + +def main(): + nodes_with_strings = sys.argv[1] + embeddings = sys.argv[2] + out_name_for_tb = sys.argv[3] + + all_embs = pickle.load(open(embeddings, "rb"))[0] + nodes = unpersist(nodes_with_strings) + + embs = [] + strings = [] + + tb_meta_path = out_name_for_tb + "meta.tsv" + tb_embs_path = out_name_for_tb + "embs.tsv" + + nodes.dropna(inplace=True) + nodes = nodes.sample(n=5000) + + with open(tb_meta_path, "w") as tb_meta: + with open(tb_embs_path, "w") as tb_embs: + tb_meta.write("string\ttype\n") + for id, string, type in zip(nodes["id"], nodes["string"], nodes["type"]): + if string is not None and not pd.isna(string) and "srctrl" not in string and "\n" not in string: + # embs.append( + # all_embs[id] + # ) + # strings.append(string) + sep = "\t" + + try: + string = f"{string}\t{type}\n" #f"{string.encode('utf-8')}\n" + emb_string = f"{sep.join(str(e) for e in all_embs[int(id)])}\n" + tb_meta.write(string) + 
tb_embs.write(emb_string) + except: + pass + + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/k_hop_graph.py b/SourceCodeTools/code/data/sourcetrail/k_hop_graph.py new file mode 100644 index 00000000..b821aec4 --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/k_hop_graph.py @@ -0,0 +1,66 @@ +from os.path import join + +import networkx as nx +from tqdm import tqdm + +from SourceCodeTools.code.data.dataset.Dataset import load_data + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("working_directory") + parser.add_argument("k_hops", type=int) + parser.add_argument("output") + + args = parser.parse_args() + + nodes, edges = load_data( + join(args.working_directory, "common_nodes.bz2"), join(args.working_directory, "common_edges.bz2") + ) + + edge_types = {} + edge_lists = {} + for s, d, t in edges[["src", "dst", "type"]].values: + edge_types[(s,d)] = t + if s not in edge_lists: + edge_lists[s] = [] + edge_lists[s].append(d) + + g = nx.from_pandas_edgelist( + edges, source="src", target="dst", create_using=nx.DiGraph, edge_attr="type" + ) + + # def expand_edges(node_id, view, edge_prefix, level=0): + # edges = [] + # if level <= args.k_hops: + # if edge_prefix != "": + # edge_prefix += "|" + # for e in view: + # edges.append((node_id, e, edge_prefix + view[e]["type"])) + # edges.extend(expand_edges(node_id, g[e], edge_prefix + view[e]["type"], level=level+1)) + # return edges + # + # edges = [] + # for node in tqdm(g.nodes): + # edges.extend(expand_edges(node, g[node], "", level=0)) + + def expand_edges(node_id, s, dlist, edge_prefix, level=0): + edges = [] + if level <= args.k_hops: + if edge_prefix != "": + edge_prefix += "|" + for d in dlist: + etype = edge_prefix + edge_types[(s,d)] + edges.append((node_id, d, etype)) + edges.extend(expand_edges(node_id, d, edge_lists[d], etype, level=level+1)) + return edges + + edges = [] + for node in 
tqdm(edge_lists): + edges.extend(expand_edges(node, node, edge_lists[node], "", level=0)) + + print() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/nodes_of_interest_from_dataset.py b/SourceCodeTools/code/data/sourcetrail/nodes_of_interest_from_dataset.py new file mode 100644 index 00000000..d9e69bcc --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/nodes_of_interest_from_dataset.py @@ -0,0 +1,29 @@ +import json + + +def get_node_ids_from_dataset(dataset_path): + node_ids = [] + with open(dataset_path, "r") as dataset: + for line in dataset: + entry = json.loads(line) + for _, _, id_ in entry["replacements"]: + node_ids.append(int(id_)) + return node_ids + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("dataset") + parser.add_argument("output") + args = parser.parse_args() + + node_ids = get_node_ids_from_dataset(args.dataset) + with open(args.output, "w") as sink: + sink.write("node_id\n") + for id_ in node_ids: + sink.write(f"{id_}\n") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/pandas_format_converter.py b/SourceCodeTools/code/data/sourcetrail/pandas_format_converter.py index ecf6fb01..450e1f7c 100644 --- a/SourceCodeTools/code/data/sourcetrail/pandas_format_converter.py +++ b/SourceCodeTools/code/data/sourcetrail/pandas_format_converter.py @@ -1,5 +1,5 @@ import sys -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * input_path = sys.argv[1] target_format = sys.argv[2] diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_add_reverse_edges.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_add_reverse_edges.py index a27850c7..42d9cc5b 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_add_reverse_edges.py +++ 
b/SourceCodeTools/code/data/sourcetrail/sourcetrail_add_reverse_edges.py @@ -1,4 +1,4 @@ -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * import pandas as p import sys diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_analyze_tree_depth.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_analyze_tree_depth.py new file mode 100644 index 00000000..cb1f96c6 --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_analyze_tree_depth.py @@ -0,0 +1,46 @@ +import ast +import os +from typing import Iterable + +from SourceCodeTools.code.data.file_utils import unpersist +import numpy as np + +class DepthEstimator: + def __init__(self): + self.depth = 0 + + def go(self, node, depth=0): + depth += 1 + if depth > self.depth: + self.depth = depth + if isinstance(node, Iterable) and not isinstance(node, str): + for subnode in node: + self.go(subnode, depth=depth) + if hasattr(node, "_fields"): + for field in node._fields: + self.go(getattr(node, field), depth=depth) + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("bodies") + args = parser.parse_args() + + bodies = unpersist(args.bodies) + + depths = [] + + for ind, row in bodies.iterrows(): + body = row.body + body_ast = ast.parse(body.strip()) + de = DepthEstimator() + de.go(body_ast) + depths.append(de.depth) + + print(f"Average depth: {sum(depths)/len(depths)}") + depths = np.array(depths, dtype=np.int32) + np.savetxt(os.path.join(os.path.dirname(args.bodies), "bodies_depths.txt"), depths, "%d") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges.py index 14cf586d..e98dac59 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges.py @@ -4,13 +4,11 @@ 
from nltk import RegexpTokenizer -from SourceCodeTools.code.python_ast import AstGraphGenerator -from SourceCodeTools.code.python_ast import GNode -from SourceCodeTools.nlp.entity.annotator.annotator_utils import to_offsets, overlap, resolve_self_collision -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.ast.python_ast import AstGraphGenerator, GNode, PythonSharedNodes +from SourceCodeTools.code.annotator_utils import to_offsets, overlap, resolve_self_collision +from SourceCodeTools.code.data.file_utils import * from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer -from SourceCodeTools.code.data.sourcetrail.common import custom_tqdm -from SourceCodeTools.code.python_ast import PythonSharedNodes +from SourceCodeTools.code.common import custom_tqdm pd.options.mode.chained_assignment = None @@ -219,7 +217,7 @@ def get_ast_nodes(edges): return nodes -from SourceCodeTools.nlp.entity.annotator.annotator_utils import adjust_offsets +from SourceCodeTools.code.annotator_utils import adjust_offsets def format_replacement_offsets(offsets): @@ -423,6 +421,9 @@ def make_reverse_edge(edge): rev_edge['type'] = edge['type'] + "_rev" rev_edge['src'] = edge['dst'] rev_edge['dst'] = edge['src'] + rev_edge['offsets'] = None + if "scope" in edge: + rev_edge["scope"] = edge["scope"] return rev_edge diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges2.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges2.py index d852163b..8ee8d578 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges2.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges2.py @@ -1,19 +1,20 @@ import argparse import re from copy import copy -from time import time_ns import networkx as nx +from SourceCodeTools.code.IdentifierPool import IntIdentifierPool +from SourceCodeTools.code.ast import has_valid_syntax +from SourceCodeTools.code.common import custom_tqdm from 
SourceCodeTools.code.data.sourcetrail.common import * -from SourceCodeTools.code.data.sourcetrail.sourcetrail_add_reverse_edges import add_reverse_edges from SourceCodeTools.code.data.sourcetrail.sourcetrail_ast_edges import NodeResolver, make_reverse_edge -from SourceCodeTools.code.python_ast import AstGraphGenerator, GNode, PythonSharedNodes -from SourceCodeTools.nlp.entity.annotator.annotator_utils import adjust_offsets2 -from SourceCodeTools.nlp.entity.annotator.annotator_utils import overlap as range_overlap -from SourceCodeTools.nlp.entity.annotator.annotator_utils import to_offsets, get_cum_lens +from SourceCodeTools.code.ast.python_ast2 import AstGraphGenerator, GNode, PythonSharedNodes +# from SourceCodeTools.code.python_ast_cf import AstGraphGenerator +from SourceCodeTools.code.annotator_utils import adjust_offsets2 +from SourceCodeTools.code.annotator_utils import overlap as range_overlap +from SourceCodeTools.code.annotator_utils import to_offsets, get_cum_lens from SourceCodeTools.nlp.string_tools import get_byte_to_char_map -from SourceCodeTools.code.data.sourcetrail.sourcetrail_parse_bodies2 import has_valid_syntax class MentionTokenizer: @@ -27,13 +28,18 @@ def __init__(self, bpe_tokenizer_path, create_subword_instances, connect_subword self.connect_subwords = connect_subwords def replace_mentions_with_subwords(self, edges): + """ + Process edges and tokenize certain node types + :param edges: List of edges + :return: List of edges, including new edges for subword tokenization + """ if self.create_subword_instances: - def produce_subw_edges(subwords, dst): - return self.produce_subword_edges_with_instances(subwords, dst) + def produce_subw_edges(subwords, dst, scope=None): + return self.produce_subword_edges_with_instances(subwords, dst, self.connect_subwords, scope=scope) else: - def produce_subw_edges(subwords, dst): - return self.produce_subword_edges(subwords, dst, self.connect_subwords) + def produce_subw_edges(subwords, dst, scope=None): + 
return self.produce_subword_edges(subwords, dst, self.connect_subwords, scope=scope) new_edges = [] for edge in edges: @@ -56,13 +62,13 @@ def produce_subw_edges(subwords, dst): else: subwords = self.bpe(edge['src'].name) - new_edges.extend(produce_subw_edges(subwords, dst)) + new_edges.extend(produce_subw_edges(subwords, dst, edge["scope"] if "scope" in edge else None)) else: new_edges.append(edge) elif self.bpe is not None and edge["type"] == "__global_name": subwords = self.bpe(edge['src'].name) - new_edges.extend(produce_subw_edges(subwords, edge['dst'])) + new_edges.extend(produce_subw_edges(subwords, edge['dst'], edge["scope"] if "scope" in edge else None)) elif self.bpe is not None and edge['src'].type in PythonSharedNodes.tokenizable_types_and_annotations: new_edges.append(edge) if edge['type'] != "global_mention_rev": @@ -70,7 +76,7 @@ def produce_subw_edges(subwords, dst): dst = edge['src'] subwords = self.bpe(dst.name) - new_edges.extend(produce_subw_edges(subwords, dst)) + new_edges.extend(produce_subw_edges(subwords, dst, edge["scope"] if "scope" in edge else None)) # elif self.bpe is not None and edge['dst'].type in {"Global"} and edge['src'].type != "Constant": # # this brach is disabled because it does not seem to make sense # # Globals can be referred by Name nodes, but they are already processed in the branch above @@ -79,7 +85,7 @@ def produce_subw_edges(subwords, dst): # # dst = edge['src'] # subwords = self.bpe(dst.name) - # new_edges.extend(produce_subw_edges(subwords, dst)) + # new_edges.extend(produce_subw_edges(subwords, dst, edge["scope"] if "scope" in edge else None)) else: new_edges.append(edge) @@ -105,7 +111,7 @@ def produce_subw_edges(subwords, dst): # return global_edges @staticmethod - def connect_prev_next_subwords(edges, current, prev_subw, next_subw): + def connect_prev_next_subwords(edges, current, prev_subw, next_subw, scope=None): if next_subw is not None: edges.append({ 'src': current, @@ -113,6 +119,8 @@ def 
connect_prev_next_subwords(edges, current, prev_subw, next_subw): 'type': 'next_subword', 'offsets': None }) + if scope is not None: + edges[-1]["scope"] = scope if prev_subw is not None: edges.append({ 'src': current, @@ -120,8 +128,10 @@ def connect_prev_next_subwords(edges, current, prev_subw, next_subw): 'type': 'prev_subword', 'offsets': None }) + if scope is not None: + edges[-1]["scope"] = scope - def produce_subword_edges(self, subwords, dst, connect_subwords=False): + def produce_subword_edges(self, subwords, dst, connect_subwords=False, scope=None): new_edges = [] subwords = list(map(lambda x: GNode(name=x, type="subword"), subwords)) @@ -132,12 +142,14 @@ def produce_subword_edges(self, subwords, dst, connect_subwords=False): 'type': 'subword', 'offsets': None }) + if scope is not None: + new_edges[-1]["scope"] = scope if connect_subwords: self.connect_prev_next_subwords(new_edges, subword, subwords[ind - 1] if ind > 0 else None, - subwords[ind + 1] if ind < len(subwords) - 1 else None) + subwords[ind + 1] if ind < len(subwords) - 1 else None, scope=scope) return new_edges - def produce_subword_edges_with_instances(self, subwords, dst, connect_subwords=True): + def produce_subword_edges_with_instances(self, subwords, dst, connect_subwords=True, scope=None): new_edges = [] subwords = list(map(lambda x: GNode(name=x, type="subword"), subwords)) @@ -150,15 +162,19 @@ def produce_subword_edges_with_instances(self, subwords, dst, connect_subwords=T 'type': 'subword_instance', 'offsets': None }) + if scope is not None: + new_edges[-1]["scope"] = scope new_edges.append({ 'src': subword_instance, 'dst': dst, 'type': 'subword', 'offsets': None }) + if scope is not None: + new_edges[-1]["scope"] = scope if connect_subwords: - self.connect_prev_next_subwords(new_edges, subword, subwords[ind - 1] if ind > 0 else None, - subwords[ind + 1] if ind < len(subwords) - 1 else None) + self.connect_prev_next_subwords(new_edges, subword_instance, instances[ind - 1] if ind > 0 
else None, + instances[ind + 1] if ind < len(instances) - 1 else None, scope=scope) return new_edges @@ -166,7 +182,7 @@ class GlobalNodeMatcher: def __init__(self, nodes, edges): from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types # filter function, classes, methods, and modules - self.allowed_node_types = set([node_types[tt] for tt in [8, 128, 4096, 8192]]) + self.allowed_node_types = set(node_types.values()) # set([node_types[tt] for tt in [8, 128, 2048, 4096, 8192]]) self.allowed_edge_types = {"defined_in"} #{"defines_rev"} self.allowed_ast_node_types = {"FunctionDef", "ClassDef", "Module", "mention"} @@ -181,6 +197,8 @@ def __init__(self, nodes, edges): self.global_graph = nx.DiGraph() self.global_graph.add_edges_from(self.get_edges_for_graph(self.edges)) + self.name_mapping = dict() + @staticmethod def get_edges_for_graph(edges): try: @@ -201,12 +219,21 @@ def get_node_types(nodes): return id2type def match_with_global_nodes(self, nodes, edges): + """ + Match AST nodes that represent functions, classes. and modules with nodes from global graph. + This information is not stored explicitly and need to walk the graph to resolve. + :param nodes: + :param edges: + :return: Mapping from AST nodes to corresponding global nodes + """ nodes = pd.DataFrame([node for node in nodes if node["type"] in self.allowed_ast_node_types]) edges = pd.DataFrame([edge for edge in edges if edge["type"] in self.allowed_ast_edge_types]) if len(edges) == 0: return {} + local_names = dict(zip(nodes["id"], nodes["serialized_name"])) + func_nodes = nodes.query("type == 'FunctionDef'")["id"].values class_nodes = nodes.query("type == 'ClassDef'")["id"].values module_nodes = nodes.query("type == 'Module'")["id"].values @@ -216,8 +243,8 @@ def match_with_global_nodes(self, nodes, edges): def find_global_id(graph, def_id, motif): """ - Do function or class global id lookup - :param graph: + Perform function or class global id lookup. 
Need this because + :param graph: nx graph :param def_id: :param motif: list with edge types, return None if path does not exist :return: @@ -225,6 +252,8 @@ def find_global_id(graph, def_id, motif): name_node = def_id motif = copy(motif) while len(motif) > 0: + if name_node not in graph: + return None link_type = motif.pop(0) for node, eprop in graph[name_node].items(): if eprop["type"] == link_type: @@ -297,7 +326,9 @@ def get_global_module_id2(func_global, class_global): module_global = get_global_module_id(module_nodes, module_candidates, func_global, class_global) new_node_ids.update(module_global) - return new_node_ids + name_mapping = {local_names[key]: self.global_names[val] for key, val in new_node_ids.items()} + + return new_node_ids, name_mapping @staticmethod def merge_global_references(global_references, module_global_references): @@ -309,36 +340,47 @@ def merge_global_references(global_references, module_global_references): class ReplacementNodeResolver(NodeResolver): - def __init__(self, nodes): - - self.nodeid2name = dict(zip(nodes['id'].tolist(), nodes['serialized_name'].tolist())) - self.nodeid2type = dict(zip(nodes['id'].tolist(), nodes['type'].tolist())) + def __init__(self, nodes=None): + + if nodes is not None: + self.nodeid2name = dict(zip(nodes['id'].tolist(), nodes['serialized_name'].tolist())) + self.nodeid2type = dict(zip(nodes['id'].tolist(), nodes['type'].tolist())) + self.valid_new_node = nodes['id'].max() + 1 + self.old_nodes = nodes.copy() + self.old_nodes['mentioned_in'] = pd.NA + self.old_nodes['string'] = pd.NA + self.old_nodes = self.old_nodes.astype({'mentioned_in': 'Int32'}) + self.old_nodes = self.old_nodes.astype({'string': 'string'}) + else: + self.nodeid2name = dict() + self.nodeid2type = dict() + self.valid_new_node = 0 + self.old_nodes = pd.DataFrame() - self.valid_new_node = nodes['id'].max() + 1 self.node_ids = {} self.new_nodes = [] self.stashed_nodes = [] - self.old_nodes = nodes.copy() - 
self.old_nodes['mentioned_in'] = pd.NA - self.old_nodes = self.old_nodes.astype({'mentioned_in': 'Int32'}) + self._resolver_cache = dict() def stash_new_nodes(self): + """ + Put new nodes into temporary storage. + :return: Nothing + """ self.stashed_nodes.extend(self.new_nodes) self.new_nodes = [] - def resolve_substrings(self, node, replacement2srctrl): - - decorated = "@" in node.name - assert not decorated - - name_ = copy(node.name) + def recover_original_string(self, name_, replacement2srctrl): + if name_ in self._resolver_cache: + return self._resolver_cache[name_] replacements = dict() global_node_id = [] global_name = [] global_type = [] - for name in re.finditer("srctrlrpl_[0-9]+", name_): + + for name in re.finditer("srctrlrpl_[0-9]{19}", name_): if isinstance(name, re.Match): name = name.group() elif isinstance(name, str): @@ -364,8 +406,26 @@ def resolve_substrings(self, node, replacement2srctrl): for r, v in replacements.items(): real_name = real_name.replace(r, v["name"]) + self._resolver_cache[name_] = (real_name, global_name, global_node_id, global_type) + return real_name, global_name, global_node_id, global_type + + def resolve_substrings(self, node, replacement2srctrl): + + decorated = "@" in node.name + + name_ = copy(node.name) + if decorated: + name_, decorator = name_.split("@") + # assert not decorated + + real_name, global_name, global_node_id, global_type = self.recover_original_string(name_, replacement2srctrl) + + if decorated: + real_name = real_name + "@" + decorator + return GNode( - name=real_name, type=node.type, global_name=global_name, global_id=global_node_id, global_type=global_type + name=real_name, type=node.type, global_name=global_name, global_id=global_node_id, global_type=global_type, + string=node.string ) def resolve_regular_replacement(self, node, replacement2srctrl): @@ -408,28 +468,52 @@ def resolve_regular_replacement(self, node, replacement2srctrl): type_ = node.type if hasattr(node, "scope"): new_node = 
GNode(name=real_name, type=type_, global_name=global_name, global_id=global_node_id, - global_type=global_type, scope=node.scope) + global_type=global_type, scope=node.scope, string=node.string) else: new_node = GNode(name=real_name, type=type_, global_name=global_name, global_id=global_node_id, - global_type=global_type) + global_type=global_type, string=node.string) else: new_node = node return new_node def resolve(self, node, replacement2srctrl): + """ + :param node: string that represents node + :param replacement2srctrl: dictionary of {sourcetrail_node: original name} + :return: resolved string with original node name + """ + # this function is defined in this class instead of SourceTrail resolver because it needs access to + # global node ids if node.type == "type_annotation": new_node = self.resolve_substrings(node, replacement2srctrl) + if "srctrlrpl_" in new_node.name: + new_node.name = "Any" else: new_node = self.resolve_regular_replacement(node, replacement2srctrl) - if "srctrlrpl_" in new_node.name: # hack to process imports + if "srctrlrpl_" in new_node.name: # hack to process imports TODO maybe need to use while to resolve everything? new_node = self.resolve_substrings(node, replacement2srctrl) + if "srctrlrpl_" in new_node.name: # if still failed to resolve + new_node.name = "unresolved_name" + + if node.string is not None: + string,_,_,_ = self.recover_original_string(node.string, replacement2srctrl) + node.string = string - assert "srctrlrpl_" not in new_node.name + if hasattr(node, "scope"): + node.scope = self.resolve(node.scope, replacement2srctrl) + + # assert "srctrlrpl_" not in new_node.name return new_node def resolve_node_id(self, node, **kwargs): + """ + Resolve node id from name and type, create new node is no nodes like that found. 
+ :param node: node + :param kwargs: + :return: updated node (return object with the save reference as input) + """ if not hasattr(node, "id"): node_repr = (node.name.strip(), node.type.strip()) @@ -440,34 +524,42 @@ def resolve_node_id(self, node, **kwargs): new_id = self.get_new_node_id() self.node_ids[node_repr] = new_id - if not PythonSharedNodes.is_shared(node): + if not PythonSharedNodes.is_shared(node) and not node.name == "unresolved_name": assert "0x" in node.name - self.new_nodes.append( - { - "id": new_id, - "type": node.type, - "serialized_name": node.name, - "mentioned_in": pd.NA - } - ) + if isinstance(node.string, str) and "srctrl" in node.string: + print(node.string) + + new_node = { + "id": new_id, + "type": node.type, + "serialized_name": node.name, + "mentioned_in": pd.NA, + "string": node.string + } + + self.new_nodes.append(new_node) if hasattr(node, "scope"): self.resolve_node_id(node.scope) - self.new_nodes[-1]["mentioned_in"] = node.scope.id + new_node["mentioned_in"] = node.scope.id node.setprop("id", new_id) return node def prepare_for_write(self, from_stashed=False): nodes = pd.concat([self.old_nodes, self.new_nodes_for_write(from_stashed)])[ - ['id', 'type', 'serialized_name', 'mentioned_in'] + ['id', 'type', 'serialized_name', 'mentioned_in', 'string'] ] return nodes def new_nodes_for_write(self, from_stashed=False): - new_nodes = pd.DataFrame(self.new_nodes if not from_stashed else self.stashed_nodes)[ - ['id', 'type', 'serialized_name', 'mentioned_in'] + new_nodes = pd.DataFrame(self.new_nodes if not from_stashed else self.stashed_nodes) + if len(new_nodes) == 0: + return None + + new_nodes = new_nodes[ + ['id', 'type', 'serialized_name', 'mentioned_in', 'string'] ].astype({"mentioned_in": "Int32"}) return new_nodes @@ -609,6 +701,11 @@ def into_offset(range): class SourcetrailResolver: + """ + Helper class to work with source code stored in Sourcetrail database. 
Implements functions of + - Iterating over files + - Preserving Sourcetrail nodes + """ def __init__(self, nodes, edges, source_location, occurrence, file_content, lang): self.nodes = nodes self.node2name = dict(zip(nodes['id'], nodes['serialized_name'])) @@ -622,6 +719,9 @@ def __init__(self, nodes, edges, source_location, occurrence, file_content, lang @property def occurrence_groups(self): + """ + :return: Iterator for occurrences grouped by file id. + """ if self._occurrence_groups is None: self._occurrence_groups = get_occurrence_groups(self.nodes, self.edges, self.source_location, self.occurrence) return self._occurrence_groups @@ -709,11 +809,18 @@ def add_names_from_edges(self, edges): def to_df(self): return pd.DataFrame(self.group2names) -def global_mention_edges_from_node(node): + +def global_mention_edges_from_node(node, scope=None): + """ + Construct a new edge that will link to a global node. + :param node: + :return: + """ global_edges = [] if type(node.global_id) is int: id_type = [(node.global_id, node.global_type)] else: + # in case there are many global references id_type = zip(node.global_id, node.global_type) for gid, gtype in id_type: @@ -723,19 +830,27 @@ def global_mention_edges_from_node(node): "type": "global_mention", "offsets": None } + if scope is not None: + global_mention["scope"] = scope global_edges.append(global_mention) global_edges.append(make_reverse_edge(global_mention)) return global_edges + def add_global_mentions(edges): + """ + :param edges: List of dictionaries that represent edges + :return: Original edges with additional edges for global references (e.g. 
global_mention edge + for function calls) + """ new_edges = [] for edge in edges: if edge['src'].type in {"#attr#", "Name"}: if hasattr(edge['src'], "global_id"): - new_edges.extend(global_mention_edges_from_node(edge['src'])) + new_edges.extend(global_mention_edges_from_node(edge['src'], scope=edge["scope"] if "scope" in edge else None)) elif edge['dst'].type == "mention": if hasattr(edge['dst'], "global_id"): - new_edges.extend(global_mention_edges_from_node(edge['dst'])) + new_edges.extend(global_mention_edges_from_node(edge['dst'], scope=edge["scope"] if "scope" in edge else None)) new_edges.append(edge) return new_edges @@ -777,6 +892,13 @@ def produce_nodes_without_name(global_nodes, ast_edges): def standardize_new_edges(edges, node_resolver, mention_tokenizer): + """ + Tokenize relevant node names, assign id to every node, collapse edge representation to id-based + :param edges: list of edges + :param node_resolver: helper class that tracks node ids + :param mention_tokenizer: helper class that performs tokenization of relevant nodes + :return: + """ edges = mention_tokenizer.replace_mentions_with_subwords(edges) @@ -801,22 +923,37 @@ def standardize_new_edges(edges, node_resolver, mention_tokenizer): return edges -def process_code(source_file_content, offsets, node_resolver, mention_tokenizer, node_matcher, named_group_tracker, track_offsets=False): +def process_code(source_file_content, offsets, node_resolver, mention_tokenizer, node_matcher, track_offsets=False): + """ + + :param source_file_content: String for a file from SOurcetrail database + :param offsets: List of global occurrences in this file + :param node_resolver: + :param mention_tokenizer: + :param node_matcher: + :param track_offsets: Flag that tells whether to perform offset tracking. 
+ :return: Tuple that stores egdes, list of global and ast offsets for the current file in the format + (start, end, node_id, set(offsets for functions where the current offset occurs), + mapping from AST nodes to corresponding global nodes (will be used to replace one with the other). + """ # replace global occurrences with special tokens to help further parsing with ast package replacer = OccurrenceReplacer() replacer.perform_replacements(source_file_content, offsets) # compute ast edges - ast_processor = AstProcessor(replacer.source_with_replacements) + try: + ast_processor = AstProcessor(replacer.source_with_replacements) + except: + return None, None, None, None try: # TODO recursion error does not appear consistently. The issue is probably with library versions... edges = ast_processor.get_edges(as_dataframe=False) except RecursionError: - return None, None, None + return None, None, None, None if len(edges) == 0: - return None, None, None + return None, None, None, None - # resolve existing node names (primarily for subwords) + # resolve sourcetrail nodes ans replace with original resolve = lambda node: node_resolver.resolve(node, replacer.replacement_index) for edge in edges: @@ -828,23 +965,32 @@ def process_code(source_file_content, offsets, node_resolver, mention_tokenizer, # insert global mentions using replacements that were created on the first step edges = add_global_mentions(edges) - named_group_tracker.add_names_from_edges(edges) + # It seems I do not need this feature + # named_group_tracker.add_names_from_edges(edges) + # tokenize names, replace nodes with their ids edges = standardize_new_edges(edges, node_resolver, mention_tokenizer) if track_offsets: def get_valid_offsets(edges): + """ + :param edges: Dictionary that represents edge. 
Information is tored in edges but is related to source node + :return: Information about location of this edge (offset) in the source file in fromat (start, end, node_id) + """ return [(edge["offsets"][0], edge["offsets"][1], edge["src"]) for edge in edges if edge["offsets"] is not None] + # recover ast offsets for the current file ast_offsets = replacer.recover_offsets_with_edits2(get_valid_offsets(edges)) def merge_global_and_ast_offsets(ast_offsets, global_offsets, definitions): """ - Merge local and global offsets and add information about the scope - :param ast_offsets: - :param global_offsets: - :param definitions: - :return: + Merge local and global offsets and add information about the scope. Preserve the information that + indicates a function where the occurrence takes place. + :param ast_offsets: List of offsets in format (start, end, node_id) + :param global_offsets: List of offsets in format (start, end, node_id) + :param definitions: offsets for function declarations + :return: List of offsets in format [start, end, node_id, set(function_offset)]. Each offset can belong to + several functions in the case of nested functions. 
""" offsets = [[*offset, set()] for offset in ast_offsets] offsets = offsets + [[*offset, set()] for offset in global_offsets] @@ -864,23 +1010,40 @@ def merge_global_and_ast_offsets(ast_offsets, global_offsets, definitions): else: global_and_ast_offsets = None - ast_nodes_to_srctrl_nodes = node_matcher.match_with_global_nodes(node_resolver.new_nodes, edges) + # Get mapping from AST nodes to global nodes + ast_nodes_to_srctrl_nodes, ast_node_names_to_global_node_names = node_matcher.match_with_global_nodes(node_resolver.new_nodes, edges) - return edges, global_and_ast_offsets, ast_nodes_to_srctrl_nodes + return edges, global_and_ast_offsets, ast_nodes_to_srctrl_nodes, ast_node_names_to_global_node_names def get_ast_from_modules( nodes, edges, source_location, occurrence, file_content, bpe_tokenizer_path, create_subword_instances, connect_subwords, lang, track_offsets=False ): - + """ + Create edges from source code and methe them wit hthe global graph. Prepare all offsets in the uniform format. + :param nodes: DataFrame with nodes + :param edges: DataFrame with edges + :param source_location: Dataframe that links nodes to source files + :param occurrence: Dataframe with records about occurrences + :param file_content: Dataframe with sources + :param bpe_tokenizer_path: path to sentencepiece model + :param create_subword_instances: + :param connect_subwords: + :param lang: + :param track_offsets: + :return: Tuple: + - Dataframe with all nodes. Schema: id, type, name, mentioned_in (global and AST) + - Dataframe with all edges. Schema: id, type, src, dst, file_id, mentioned_in + - Dataframe with all_offsets. 
Schema: file_id, start, end, node_id, mentioned_in + """ srctrl_resolver = SourcetrailResolver(nodes, edges, source_location, occurrence, file_content, lang) node_resolver = ReplacementNodeResolver(nodes) node_matcher = GlobalNodeMatcher(nodes, edges) # add_reverse_edges(edges)) mention_tokenizer = MentionTokenizer(bpe_tokenizer_path, create_subword_instances, connect_subwords) - name_group_tracker = NameGroupTracker() all_ast_edges = [] all_global_references = {} + all_name_mappings = {} all_offsets = [] for group_ind, (file_id, occurrences) in custom_tqdm( @@ -897,8 +1060,8 @@ def get_ast_from_modules( # process code # try: - edges, global_and_ast_offsets, ast_nodes_to_srctrl_nodes = process_code( - source_file_content, offsets, node_resolver, mention_tokenizer, node_matcher, name_group_tracker, track_offsets=track_offsets + edges, global_and_ast_offsets, ast_nodes_to_srctrl_nodes, ast_node_names_to_global_node_names = process_code( + source_file_content, offsets, node_resolver, mention_tokenizer, node_matcher, track_offsets=track_offsets ) # except SyntaxError: # logging.warning(f"Error processing file_id {file_id}") @@ -916,8 +1079,15 @@ def get_ast_from_modules( all_ast_edges.extend(edges) node_matcher.merge_global_references(all_global_references, ast_nodes_to_srctrl_nodes) + all_name_mappings.update(ast_node_names_to_global_node_names) def format_offsets(global_and_ast_offsets, target): + """ + Format offset as a record and add to the common storage for offsets + :param global_and_ast_offsets: + :param target: List where all other offsets are stored. 
+ :return: Nothing + """ if global_and_ast_offsets is not None: for offset in global_and_ast_offsets: target.append({ @@ -939,7 +1109,7 @@ def replace_ast_node_to_global(edges, mapping): if "scope" in edge: edge["scope"] = mapping.get(edge["scope"], edge["scope"]) - replace_ast_node_to_global(all_ast_edges, all_global_references) + # replace_ast_node_to_global(all_ast_edges, all_global_references) # disabled to keep global nodes separate def create_subwords_for_global_nodes(): all_ast_edges.extend( @@ -947,7 +1117,7 @@ def create_subwords_for_global_nodes(): node_resolver, mention_tokenizer)) node_resolver.stash_new_nodes() - create_subwords_for_global_nodes() + # create_subwords_for_global_nodes() # disabled to keep global nodes separate def prepare_new_nodes(node_resolver): @@ -960,11 +1130,32 @@ def prepare_new_nodes(node_resolver): }, from_stashed=True ) node_resolver.map_mentioned_in_to_global(all_global_references, from_stashed=True) + # TODO since some function definitions and modules are dropped, need to rename decorated nodes as well + # find old name in node_resolver.stashed nodes, go over all nodes and replace corresponding substrings in names node_resolver.drop_nodes(set(all_global_references.keys()), from_stashed=True) - prepare_new_nodes(node_resolver) + for offset in all_offsets: + offset["node_id"] = all_global_references.get(offset["node_id"], offset["node_id"]) + offset["mentioned_in"] = [(e[0], e[1], all_global_references.get(e[2], e[2])) for e in offset["mentioned_in"]] + + + # prepare_new_nodes(node_resolver) # disabled to keep global nodes separate all_ast_nodes = node_resolver.new_nodes_for_write(from_stashed=True) + if all_ast_nodes is None: + return None, None, None, None + + def decipher_node_name(name): + if name in all_name_mappings: + return all_name_mappings[name] + elif "@" in name: + for key, value in all_name_mappings.items(): + if key in name: + return name.replace(key, value) + return name + return name + + # 
all_ast_nodes["serialized_name"] = all_ast_nodes["serialized_name"].apply(decipher_node_name) def prepare_edges(all_ast_edges): all_ast_edges = pd.DataFrame(all_ast_edges) @@ -983,7 +1174,15 @@ def prepare_edges(all_ast_edges): else: all_offsets = None - return all_ast_nodes, all_ast_edges, all_offsets, name_group_tracker.to_df() + if len(all_name_mappings) > 0: + all_name_mappings = pd.DataFrame({ + "ast_name": list(all_name_mappings.keys()), + "proper_names": list(all_name_mappings.values()) + }) + else: + all_name_mappings = None + + return all_ast_nodes, all_ast_edges, all_offsets, all_name_mappings class OccurrenceReplacer: @@ -995,6 +1194,8 @@ def __init__(self): self.evicted = None self.edits = None + self._identifier_pool = IntIdentifierPool() + @staticmethod def format_offsets_for_replacements(offsets): offsets = offsets.sort_values(by=["start", "end"], ascending=[True, False]) @@ -1057,8 +1258,10 @@ def perform_replacements(self, source_file_content, offsets): offset = self.group_overlapping_offsets(offset, pending) # new_name = f"srctrlnd_{offset[2][0]}" - replacement_id = int(time_ns()) - new_name = "srctrlrpl_" + str(replacement_id) + # replacement_id = str(int(time_ns())) + replacement_id = self._identifier_pool.get_new_identifier() + assert len(replacement_id) == 19 + new_name = "srctrlrpl_" + replacement_id replacement_index[new_name] = { "srctrl_id": offset[2][0] if type(offset[2]) is not list else [o[0] for o in offset[2]], "original_string": src_str @@ -1190,7 +1393,7 @@ def recover_offsets_with_edits2(self, offsets): edges = read_edges(working_directory) file_content = read_filecontent(working_directory) - ast_nodes, ast_edges, offsets, ng = get_ast_from_modules(nodes, edges, source_location, occurrence, file_content, + ast_nodes, ast_edges, offsets = get_ast_from_modules(nodes, edges, source_location, occurrence, file_content, args.bpe_tokenizer, args.create_subword_instances, args.connect_subwords, args.lang) diff --git 
a/SourceCodeTools/code/data/sourcetrail/sourcetrail_call_seq_extractor.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_call_seq_extractor.py index 73f213a4..387272c2 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_call_seq_extractor.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_call_seq_extractor.py @@ -1,3 +1,4 @@ +from SourceCodeTools.code.common import custom_tqdm, SQLTable from SourceCodeTools.code.data.sourcetrail.common import * import sys @@ -31,7 +32,7 @@ def extract_call_seq(nodes, edges, source_location, occurrence): enumerate(occurrence_groups), message="Extracting call sequences", total=len(occurrence_groups) ): - sql_occurrences = SQLTable(occurrences, "/tmp/sourcetrail_occurrences.db", "occurrences") + sql_occurrences = SQLTable(occurrences, ":memory:", "occurrences") function_definitions = sql_get_function_definitions(sql_occurrences) diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_compute_function_diameter.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_compute_function_diameter.py index 7c97bf49..3fbc9d32 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_compute_function_diameter.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_compute_function_diameter.py @@ -1,5 +1,5 @@ -from SourceCodeTools.code.data.sourcetrail.file_utils import * -from SourceCodeTools.code.data.sourcetrail.common import custom_tqdm +from SourceCodeTools.code.data.file_utils import * +from SourceCodeTools.code.common import custom_tqdm import networkx as nx from collections import Counter diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_connected_component.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_connected_component.py index 84828dc8..ad0876e4 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_connected_component.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_connected_component.py @@ -1,7 +1,7 @@ import networkx as nx import sys -from 
SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * def main(): diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_decode_edge_types.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_decode_edge_types.py index 34f2ba14..51369150 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_decode_edge_types.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_decode_edge_types.py @@ -1,5 +1,5 @@ from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import edge_types -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * import sys import os diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_draw_graph.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_draw_graph.py index 527309fe..f7793e2d 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_draw_graph.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_draw_graph.py @@ -1,16 +1,14 @@ import pandas as pd -import pygraphviz as pgv def visualize(nodes, edges, output_path): from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types + import pygraphviz as pgv global_types = list(node_types.values()) edges = edges[edges["type"].apply(lambda x: not x.endswith("_rev"))] - from SourceCodeTools.code.data.sourcetrail.Dataset import get_global_edges, ensure_connectedness - # def remove_ast_edges(nodes, edges): # global_edges = get_global_edges() # global_edges.add("subword") @@ -32,6 +30,9 @@ def visualize(nodes, edges, output_path): g = pgv.AGraph(strict=False, directed=True) + from SourceCodeTools.code.ast.python_ast2 import PythonNodeEdgeDefinitions + auxiliaty_edge_types = PythonNodeEdgeDefinitions.auxiliary_edges() + for ind, edge in edges.iterrows(): src = edge['source_node_id'] dst = edge['target_node_id'] @@ -39,7 +40,7 @@ def visualize(nodes, edges, output_path): dst_name = id2name[dst] g.add_node(src_name, 
color="blue" if id2type[src] in global_types else "black") g.add_node(dst_name, color="blue" if id2type[dst] in global_types else "black") - g.add_edge(src_name, dst_name) + g.add_edge(src_name, dst_name, color="blue" if edge['type'] in auxiliaty_edge_types else "black") g_edge = g.get_edge(src_name, dst_name) g_edge.attr['label'] = edge['type'] diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_edges_name_resolve.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_edges_name_resolve.py index 8e69b141..3e9345d0 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_edges_name_resolve.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_edges_name_resolve.py @@ -1,6 +1,5 @@ -import pandas as p -import sys, os -from SourceCodeTools.code.data.sourcetrail.file_utils import * +import sys +from SourceCodeTools.code.data.file_utils import * nodes = unpersist(sys.argv[1]) edges = unpersist(sys.argv[2]) diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_node_names.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_node_names.py index bbf490bd..da50d1a2 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_node_names.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_node_names.py @@ -1,6 +1,7 @@ import sys -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.ast_graph.extract_node_names import extract_node_names +from SourceCodeTools.code.data.file_utils import * def get_node_name(full_name): @@ -10,25 +11,13 @@ def get_node_name(full_name): return full_name.split(".")[-1].split("___")[0] -def extract_node_names(nodes, min_count): - - +def extract_node_names_(nodes, min_count): # some cells are empty, probably because of empty strings in AST # data = nodes.dropna(axis=0) # data = data[data['type'] != 262144] - data = nodes - data['src'] = data['id'] - data['dst'] = data['serialized_name'].apply(get_node_name) - - counts = 
data['dst'].value_counts() - - data['counts'] = data['dst'].apply(lambda x: counts[x]) - data = data.query(f"counts >= {min_count}") - - if len(data) > 0: - return data[['src', 'dst']] - else: - return None + nodes = nodes.copy() + nodes['serialized_name'] = nodes['serialized_name'].apply(get_node_name) + return extract_node_names(nodes, min_count) if __name__ == "__main__": diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_variable_names.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_variable_names.py index 8378163e..553e763e 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_variable_names.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_variable_names.py @@ -3,8 +3,8 @@ import sys from collections import Counter -from SourceCodeTools.code.data.sourcetrail.common import custom_tqdm -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.common import custom_tqdm +from SourceCodeTools.code.data.file_utils import * pandas.options.mode.chained_assignment = None diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_ambiguous_edges.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_ambiguous_edges.py index 403c6fa2..16d55342 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_ambiguous_edges.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_ambiguous_edges.py @@ -1,7 +1,7 @@ #%% import sys -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * def filter_ambiguous_edges(edges, ambiguous_edges): diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_type_edges.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_type_edges.py new file mode 100644 index 00000000..6cf74270 --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_type_edges.py @@ -0,0 +1,21 @@ +import sys +from os.path import join + 
+from SourceCodeTools.code.data.ast_graph.filter_type_edges import filter_type_edges +from SourceCodeTools.code.data.file_utils import * + + +def main(): + working_directory = sys.argv[1] + nodes = unpersist(join(working_directory, "nodes.bz2")) + edges = unpersist(join(working_directory, "edges.bz2")) + out_annotations = sys.argv[2] + out_no_annotations = sys.argv[3] + + no_annotations, annotations = filter_type_edges(nodes, edges) + + persist(annotations, out_annotations) + persist(no_annotations, out_no_annotations) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_fn_visualizer.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_fn_visualizer.py new file mode 100644 index 00000000..f44ca3c5 --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_fn_visualizer.py @@ -0,0 +1,35 @@ +from os.path import join + +from SourceCodeTools.code.data.dataset.Dataset import load_data +from SourceCodeTools.code.data.sourcetrail.sourcetrail_draw_graph import visualize + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("working_directory") + parser.add_argument("function_id", type=int) + parser.add_argument("output_path") + + args = parser.parse_args() + + nodes_path = join(args.working_directory, "common_nodes.bz2") + edges_path = join(args.working_directory, "common_edges.bz2") + + nodes, edges = load_data(nodes_path, edges_path, rename_columns=False) + + fn_edges = edges.query(f"mentioned_in == {args.function_id}") + fn_node_ids = set(fn_edges["source_node_id"] + fn_edges["target_node_id"]) + registered_edges = set(fn_edges["id"]) + extra_edges = edges[ + (edges["source_node_id"].apply(lambda id_: id_ in fn_node_ids) | + edges["target_node_id"].apply(lambda id_: id_ in fn_node_ids) ) #& + # edges["id"].apply(lambda id_: id_ not in registered_edges) + ] + fn_nodes = nodes[ + nodes["id"].apply(lambda id_: id_ in fn_node_ids) + ] + 
visualize(nodes, fn_edges, args.output_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_get_module_sizes.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_get_module_sizes.py index 6eac0345..c9da6c51 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_get_module_sizes.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_get_module_sizes.py @@ -2,9 +2,7 @@ from collections import Counter from pprint import pprint -from SourceCodeTools.code.data.sourcetrail.file_utils import unpersist -from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types -import pandas as pd +from SourceCodeTools.code.data.file_utils import unpersist # def estimate_module_sizes(nodes_path): diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_graph_complexity_analysis.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_graph_complexity_analysis.py new file mode 100644 index 00000000..f3c552ef --- /dev/null +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_graph_complexity_analysis.py @@ -0,0 +1,309 @@ +import logging +import pickle +from collections import defaultdict +from functools import partial +from os.path import join, dirname + +import dgl +import pandas as pd +import argparse +import numpy as np + +import matplotlib.pyplot as plt +from matplotlib.ticker import MaxNLocator +from tqdm import tqdm + +from SourceCodeTools.code.ast import has_valid_syntax +from SourceCodeTools.code.data.file_utils import unpersist +from SourceCodeTools.code.data.sourcetrail.sourcetrail_ast_edges2 import AstProcessor, standardize_new_edges, \ + ReplacementNodeResolver, MentionTokenizer +from SourceCodeTools.code.data.sourcetrail.sourcetrail_compute_function_diameter import compute_diameter +# from SourceCodeTools.code.data.type_annotation_dataset.create_type_annotation_dataset import get_docstring, \ +# remove_offsets +from SourceCodeTools.nlp import 
create_tokenizer + + +import tokenize +from io import StringIO +def remove_comments_and_docstrings(source): + """ + Returns 'source' minus comments and docstrings. + https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings + """ + io_obj = StringIO(source) + out = "" + prev_toktype = tokenize.INDENT + last_lineno = -1 + last_col = 0 + for tok in tokenize.generate_tokens(io_obj.readline): + token_type = tok[0] + token_string = tok[1] + start_line, start_col = tok[2] + end_line, end_col = tok[3] + ltext = tok[4] + # The following two conditionals preserve indentation. + # This is necessary because we're not using tokenize.untokenize() + # (because it spits out code with copious amounts of oddly-placed + # whitespace). + if start_line > last_lineno: + last_col = 0 + if start_col > last_col: + out += (" " * (start_col - last_col)) + # Remove comments: + if token_type == tokenize.COMMENT: + pass + # This series of conditionals removes docstrings: + elif token_type == tokenize.STRING: + if prev_toktype != tokenize.INDENT: + # This is likely a docstring; double-check we're not inside an operator: + if prev_toktype != tokenize.NEWLINE: + if start_col > 0: + # Unlabelled indentation means we're inside an operator + out += token_string + else: + out += token_string + prev_toktype = token_type + last_col = end_col + last_lineno = end_line + return out + + +def process_code(source_file_content, node_resolver, mention_tokenizer): + try: + ast_processor = AstProcessor(source_file_content) + except: + logging.warning("Unknown exception") + return None + try: # TODO recursion error does not appear consistently. The issue is probably with library versions... 
+ edges = ast_processor.get_edges(as_dataframe=False) + except RecursionError: + return None + + if len(edges) == 0: + return None + + edges = standardize_new_edges(edges, node_resolver, mention_tokenizer) + + return edges + + +def compute_gnn_passings(body, mention_tokenizer, num_layers=None): + + node_resolver = ReplacementNodeResolver() + + source_file_content = body.lstrip() + + edges = process_code( + source_file_content, node_resolver, mention_tokenizer + ) + + if edges is None: + return None + + edges = pd.DataFrame(edges).rename({"src": "source_node_id", "dst": "target_node_id"}, axis=1) + diameter = compute_diameter(edges, func_id=0) + + unique_nodes = set(edges["source_node_id"]) | set(edges["target_node_id"]) + + G = dgl.DGLGraph() + G.add_edges(edges["source_node_id"], edges["target_node_id"]) + + def compute_for_n_layers(n_layers): + sampler = dgl.dataloading.MultiLayerFullNeighborSampler(n_layers) + + for node in G.nodes(): + non_zero_blocks = 0 + num_edges = 0 + loader = dgl.dataloading.NodeDataLoader( + G, [node], sampler, batch_size=1, shuffle=True, num_workers=0) + for input_nodes, seeds, blocks in loader: + num_edges = 0 + for block in blocks: + if block.num_edges() > 0: + non_zero_blocks += 1 + num_edges += block.num_edges() + if num_edges != 0 and non_zero_blocks >= n_layers: + break + return num_edges + + passings_num_layers = compute_for_n_layers(num_layers) + passings_diameter = compute_for_n_layers(diameter) + + return len(edges), passings_num_layers, passings_diameter, diameter, len(unique_nodes) + + +def compute_transformer_passings(body, bpe): + num_tokens = len(bpe(body)) + return num_tokens + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("bodies") + parser.add_argument("bpe_path") + parser.add_argument("--num_layers", default=8, type=int) + + args = parser.parse_args() + + bodies = unpersist(args.bodies) + bpe = create_tokenizer(type="bpe", bpe_path=args.bpe_path) + mention_tokenizer = 
MentionTokenizer(args.bpe_path, create_subword_instances=False, connect_subwords=False) + + # lengths_tr = defaultdict(list) + # lengths_gnn_layers = defaultdict(list) + # lengths_gnn_diameter = defaultdict(list) + # ratio_trans = [] + # ratio_layers = [] + # ratio_diameter = [] + # diameters = [] + # node_ratio = [] + # + # for body in tqdm(bodies["body"]): + # if not has_valid_syntax(body): + # continue + # + # body_ = body + # body = body_.lstrip() + # initial_strip = body[:len(body_) - len(body)] + # + # # docsting_offsets = get_docstring(body) + # # body, replacements, docstrings = remove_offsets(body, [], docsting_offsets) + # body = remove_comments_and_docstrings(body) + # + # n_tokens = compute_transformer_passings(body, bpe) + # result = compute_gnn_passings(body, mention_tokenizer, args.num_layers) + # if result is None: + # continue + # n_edges, n_passings, n_passings_diam, diameter, num_nodes = result + # + # if num_nodes / n_tokens < 0.3: + # print() + # + # # lengths_tr[n_tokens].append(n_tokens ** 2 * args.num_layers) + # # lengths_gnn_layers[n_tokens].append(n_passings)# * args.num_layers) + # # lengths_gnn_diameter[n_tokens].append(n_passings_diam) # * args.num_layers) + # ratio_trans.append((n_tokens, n_tokens ** 2 * args.num_layers)) + # ratio_layers.append((n_tokens, n_passings)) + # ratio_diameter.append((n_tokens, n_passings_diam)) + # node_ratio.append((n_tokens, num_nodes)) + # diameters.append((n_tokens, diameter)) + # + # for key in lengths_tr: + # data_tr = np.array(lengths_tr[key]) + # data_gnn_layers = np.array(lengths_gnn_layers[key]) + # data_gnn_diameter = np.array(lengths_gnn_diameter[key]) + # + # # for key in lengths_tr: + # # data_tr = np.array(lengths_tr[key]) + # # data_gnn_layers = np.array(lengths_gnn_layers[key]) + # # data_gnn_diameter = np.array(lengths_gnn_diameter[key]) + # # + # # lengths_tr[key] = np.mean(data_tr)#, np.std(data_tr)) + # # lengths_gnn_layers[key] = np.mean(data_gnn_layers)#, np.std(data_gnn)) + # # 
lengths_gnn_diameter[key] = np.mean(data_gnn_diameter) + # + # data_node_ratio = np.array(node_ratio) + # data_ratios_trans = np.array(ratio_trans) + # data_ratios_layers = np.array(ratio_layers) + # data_ratios_diameter = np.array(ratio_diameter) + # data_diameters = np.array(diameters) + + get_path = partial(join, dirname(args.bodies)) + + # np.save(get_path("data_node_ratio.npy"), data_node_ratio) + # np.save(get_path("data_ratios_layers.npy"), data_ratios_layers) + # np.save(get_path("data_ratios_diameter.npy"), data_ratios_diameter) + # np.save(get_path("data_ratios_trans.npy"), data_ratios_trans) + # np.save(get_path("data_diameters.npy"), data_diameters) + # # pickle.dump(lengths_tr, open(get_path("lengths_tr.pkl"), "wb")) + # # pickle.dump(lengths_gnn_layers, open(get_path("lengths_gnn_layers.pkl"), "wb")) + # # pickle.dump(lengths_gnn_diameter, open(get_path("lengths_gnn_diameter.pkl"), "wb")) + + data_ratios_trans = np.load(get_path("data_ratios_trans.npy")) + data_node_ratio = np.load(get_path("data_node_ratio.npy")) + data_ratios_layers = np.load(get_path("data_ratios_layers.npy")) + data_ratios_diameter = np.load(get_path("data_ratios_diameter.npy")) + data_diameters = np.load(get_path("data_diameters.npy")) + # lengths_tr = pickle.load(open(get_path("lengths_tr.pkl"), "rb")) + # lengths_gnn_layers = pickle.load(open(get_path("lengths_gnn_layers.pkl"), "rb")) + # lengths_gnn_diameter = pickle.load(open(get_path("lengths_gnn_diameter.pkl"), "rb")) + + # plt.plot(data_ratios_layers[:, 0], data_ratios_layers[:, 1], "*") + # plt.plot(data_ratios_diameter[:, 0], data_ratios_diameter[:, 1], "*") + # plt.xlabel("Число токенов") + # plt.ylabel("Число обменов сообщениями") + # plt.legend([f"Количество слоёв = {args.num_layers}", "Количество слоёв = Диаметр графа"]) + # plt.savefig(join(dirname(args.bodies), "tokens_edges.pdf")) + # plt.close() + + print("Average diameter:", sum(data_diameters[:, 1]) / len(data_diameters[:, 1])) + + plt.hexbin(data_diameters[:, 
0], data_diameters[:, 1], bins="log", cmap="magma_r", gridsize=150) + plt.xlabel("Число токенов в исходном коде") + plt.ylabel("Диметр графа исходного кода") + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + # plt.legend([f"Количество слоёв = {args.num_layers}", "Количество слоёв = Диаметр графа"]) + plt.savefig(join(dirname(args.bodies), "token_diameters.pdf")) + plt.close() + + plt.hist(data_node_ratio[:, 1] / data_node_ratio[:, 0], bins=20, range=(0, 2), alpha=0.7) + plt.xlabel("Число узлов в графе / Число токенов в исходном коде") + plt.ylabel("Частота") + # plt.legend([f"Количество слоёв = {args.num_layers}", "Количество слоёв = Диаметр графа"]) + plt.savefig(join(dirname(args.bodies), "ratio_nodes.pdf")) + plt.close() + + # plt.hist(data_ratios_layers[:, 1] / data_ratios_layers[:, 0], bins=20, range=(0,30), alpha=0.7) + # plt.hist(data_ratios_diameter[:, 1] / data_ratios_diameter[:, 0], bins=20, range=(0,30), alpha=0.7) + # plt.xlabel("Число обменов сообщениями / Число токенов") + # plt.ylabel("Частота") + # plt.legend([f"Количество слоёв = {args.num_layers}", "Количество слоёв = Диаметр графа"]) + # plt.savefig(join(dirname(args.bodies), "ratio.pdf")) + # plt.close() + + # ratio_layers = data_ratios_layers[:, 1] / data_ratios_layers[:, 0] + # ratio_layers = (np.mean(ratio_layers), np.std(ratio_layers)) + # + # ratio_diameter = data_ratios_diameter[:, 1] / data_ratios_diameter[:, 0] + # ratio_diameter = (np.mean(ratio_diameter), np.std(ratio_diameter)) + + plt.figure() + plt.xlim([0, 1750]) + plt.ylim([0, 5e4]) + plt.plot(data_ratios_trans[:, 0], data_ratios_trans[:, 1], "o", alpha=0.3, markersize=2, markeredgewidth=0) + plt.plot(data_ratios_layers[:, 0], data_ratios_layers[:, 1], "o", alpha=0.3, markersize=2, markeredgewidth=0) + plt.plot(data_ratios_diameter[:, 0], data_ratios_diameter[:, 1], "o", alpha=0.3, markersize=2, markeredgewidth=0) + # plt.gca().set_yscale('log') + plt.grid() + plt.legend([ + f"Трансформер, количество слоёв, = 
{args.num_layers}", + f"GNN, количество слоёв = {args.num_layers}", + f"GNN, количество слоёв = Диаметр графа"] + ) + plt.xlabel("Число токенов") + plt.ylabel("Число обменов сообщениями") + plt.savefig(join(dirname(args.bodies), "avg_passings.pdf")) + plt.close() + + plt.plot([], [], ".") + plt.plot(data_node_ratio[:, 1], data_ratios_layers[:, 1], ".", alpha=0.1) + plt.plot(data_node_ratio[:, 1], data_ratios_diameter[:, 1], ".", alpha=0.1) + # plt.gca().set_yscale('log') + plt.ylim([0, 1e4]) + plt.grid() + plt.legend([ + "", + f"Количество слоёв, GNN = {args.num_layers}", + f"Количество слоёв, GNN = Диаметр графа"] + ) + plt.xlabel("Число узлов в графе исходного кода") + plt.ylabel("Число обменов сообщениями") + plt.savefig(join(dirname(args.bodies), "avg_passings_vs_nodes.pdf")) + plt.close() + + + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns.py index 6820849a..bbbaa38e 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns.py @@ -1,24 +1,8 @@ import sys -from typing import Iterable -from SourceCodeTools.code.data.sourcetrail.common import \ - map_id_columns, merge_with_file_if_exists, create_local_to_global_id_map -from SourceCodeTools.code.data.sourcetrail.file_utils import * - - -def map_columns(input_table, id_map, columns, columns_special=None): - - input_table = map_id_columns(input_table, columns, id_map) - - if columns_special is not None: - assert isinstance(columns_special, list), "`columns_special` should be iterable" - for column, map_func in columns_special: - input_table[column] = map_func(input_table[column], id_map) - - if len(input_table) == 0: - return None - else: - return input_table +from SourceCodeTools.code.common import map_columns, merge_with_file_if_exists +from 
SourceCodeTools.code.data.ast_graph.local2global import create_local_to_global_id_map +from SourceCodeTools.code.data.file_utils import * if __name__ == "__main__": diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns_only_annotations.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns_only_annotations.py index d022b1da..047ccbe6 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns_only_annotations.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns_only_annotations.py @@ -1,7 +1,8 @@ import sys -from SourceCodeTools.code.data.sourcetrail.common import map_id_columns, merge_with_file_if_exists, create_local_to_global_id_map -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.common import map_id_columns, merge_with_file_if_exists +from SourceCodeTools.code.data.ast_graph.local2global import create_local_to_global_id_map +from SourceCodeTools.code.data.file_utils import * def map_columns_with_annotations(input_table, id_map, columns): diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_merge_graphs.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_merge_graphs.py index 4d68195d..471b2a56 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_merge_graphs.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_merge_graphs.py @@ -1,7 +1,7 @@ import sys -from SourceCodeTools.code.data.sourcetrail.file_utils import * -from SourceCodeTools.code.data.sourcetrail.common import merge_with_file_if_exists +from SourceCodeTools.code.common import merge_with_file_if_exists +from SourceCodeTools.code.data.file_utils import * pd.options.mode.chained_assignment = None @@ -43,7 +43,7 @@ def merge_global_with_local(existing_nodes, next_valid_id, local_nodes): assert len(new_nodes) == len(set(new_nodes['node_repr'].to_list())) - ids_start = next_valid_id + ids_start = int(next_valid_id) ids_end = ids_start + len(new_nodes) 
new_nodes['id'] = list(range(ids_start, ids_end)) diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_local2global.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_local2global.py index 27280010..93a28b91 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_local2global.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_local2global.py @@ -1,14 +1,14 @@ import sys -from SourceCodeTools.code.data.sourcetrail.file_utils import * -from SourceCodeTools.code.data.sourcetrail.common import create_local_to_global_id_map +from SourceCodeTools.code.data.ast_graph.local2global import create_local_to_global_id_map +from SourceCodeTools.code.data.file_utils import * def get_local2global(global_nodes, local_nodes) -> pd.DataFrame: local_nodes = local_nodes.copy() id_map = create_local_to_global_id_map(local_nodes=local_nodes, global_nodes=global_nodes) - local_nodes['global_id'] = local_nodes['id'].apply(lambda x: id_map.get(x, -1)) + local_nodes['global_id'] = local_nodes['id'].apply(lambda x: id_map.get(x, None)) return local_nodes[['id', 'global_id']] diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_name_merge.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_name_merge.py index 22715a08..6f4d5fc5 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_name_merge.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_node_name_merge.py @@ -1,6 +1,6 @@ import sys from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * # needs testing def normalize(line): diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_parse_bodies2.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_parse_bodies2.py index a127ff49..f80953ed 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_parse_bodies2.py +++ 
b/SourceCodeTools/code/data/sourcetrail/sourcetrail_parse_bodies2.py @@ -2,8 +2,10 @@ import sys from typing import Tuple, List, Optional +from SourceCodeTools.code.ast import has_valid_syntax +from SourceCodeTools.code.common import custom_tqdm, SQLTable from SourceCodeTools.code.data.sourcetrail.common import * -from SourceCodeTools.nlp.entity.annotator.annotator_utils import to_offsets +from SourceCodeTools.code.annotator_utils import to_offsets pd.options.mode.chained_assignment = None # default='warn' @@ -60,7 +62,10 @@ def get_function_body(file_content, file_id, start, end, s_col, e_col) -> str: source_lines = file_content[file_id].split("\n") - body_lines = source_lines[start: end] + if start == end: # handle situations when the entire function takes only one line + body_lines = source_lines[start] + else: + body_lines = source_lines[start: end] initial_strip = body_lines[0][0:len(body_lines[0]) - len(body_lines[0].lstrip())] body = initial_strip + file_content[file_id][offsets[0][0]: offsets[0][1]] @@ -68,14 +73,6 @@ def get_function_body(file_content, file_id, start, end, s_col, e_col) -> str: return body -def has_valid_syntax(function_body): - try: - ast.parse(function_body.lstrip()) - return True - except SyntaxError: - return False - - def get_range_for_replacement(occurrence, start_col, end_col, line, nodeid2name): extended_range = extend_range(start_col, end_col, line) @@ -100,6 +97,15 @@ def get_range_for_replacement(occurrence, start_col, end_col, line, nodeid2name) def process_body(body, local_occurrences, nodeid2name, f_id, f_start): + """ + Extract the list + :param body: + :param local_occurrences: + :param nodeid2name: + :param f_id: + :param f_start: + :return: + """ body_lines = body.split("\n") local_occurrences = sort_occurrences(local_occurrences) @@ -141,6 +147,16 @@ def process_body(body, local_occurrences, nodeid2name, f_id, f_start): def process_bodies(nodes, edges, source_location, occurrence, file_content, lang): + """ + :param 
nodes: + :param edges: + :param source_location: + :param occurrence: + :param file_content: + :param lang: + :return: Dataframe with columns id, body, sourcetrail_node_offsets. Node offsets are not resolved from the + global graph. + """ occurrence_groups = get_occurrence_groups(nodes, edges, source_location, occurrence) @@ -153,7 +169,7 @@ def process_bodies(nodes, edges, source_location, occurrence, file_content, lang for group_ind, (file_id, occurrences) in custom_tqdm( enumerate(occurrence_groups), message="Processing function bodies", total=len(occurrence_groups) ): - sql_occurrences = SQLTable(occurrences, "/tmp/sourcetrail_occurrences.db", "occurrences") + sql_occurrences = SQLTable(occurrences, ":memory:", "occurrences") function_definitions = sql_get_function_definitions(sql_occurrences) diff --git a/SourceCodeTools/code/data/sourcetrail/sourcetrail_types.py b/SourceCodeTools/code/data/sourcetrail/sourcetrail_types.py index bbad1d77..dc442261 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_types.py +++ b/SourceCodeTools/code/data/sourcetrail/sourcetrail_types.py @@ -15,7 +15,8 @@ 2: "uses_type", # from user to type 16: "inheritance", 4: "uses", # from user to item - 512: "imports" # from module to imported object + 512: "imports", # from module to imported object + 32: "non-idexed-package" } diff --git a/SourceCodeTools/code/data/type_annotation_dataset/create_type_annotation_aware_graph_partition.py b/SourceCodeTools/code/data/type_annotation_dataset/create_type_annotation_aware_graph_partition.py new file mode 100644 index 00000000..a7a5e4b0 --- /dev/null +++ b/SourceCodeTools/code/data/type_annotation_dataset/create_type_annotation_aware_graph_partition.py @@ -0,0 +1,91 @@ +import argparse +import json +from os.path import join +from random import random + +import pandas as pd + +from SourceCodeTools.code.common import read_nodes +from SourceCodeTools.code.data.file_utils import persist + + +def add_splits(items, train_frac, 
restricted_id_pool=None, force_test=None): + items = items.copy() + + if force_test is None: + force_test = set() + + def random_partition(node_id): + r = random() + if node_id not in force_test: + if r < train_frac: + return "train" + elif r < train_frac + (1 - train_frac) / 2: + return "val" + else: + return "test" + else: + if r < .5: + return "val" + else: + return "test" + + import numpy as np + # define partitioning + masks = np.array([random_partition(node_id) for node_id in items["id"]]) + + # create masks + items["train_mask"] = masks == "train" + items["val_mask"] = masks == "val" + items["test_mask"] = masks == "test" + + if restricted_id_pool is not None: + # if `restricted_id_pool` is provided, mask all nodes not in `restricted_id_pool` negatively + to_keep = items.eval("id in @restricted_ids", local_dict={"restricted_ids": restricted_id_pool}) + items["train_mask"] = items["train_mask"] & to_keep + items["test_mask"] = items["test_mask"] & to_keep + items["val_mask"] = items["val_mask"] & to_keep + + return items + + +def read_test_set_nodes(path): + node_ids = set() + with open(path, "r") as dataset: + for line in dataset: + if line.strip() == "": + continue + + text, entry = json.loads(line) + + for _, _, node_id in entry["replacements"]: + node_ids.add(node_id) + + return node_ids + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("working_directory") + parser.add_argument("type_annotation_test_set") + parser.add_argument("output_path") + + args = parser.parse_args() + + all_nodes = [] + for nodes in read_nodes(join(args.working_directory, "common_nodes.json.bz2"), as_chunks=True): + all_nodes.append(nodes[["id"]]) + + partition = add_splits( + items=pd.concat(all_nodes), + train_frac=0.8, + force_test=read_test_set_nodes(args.type_annotation_test_set) + ) + + persist(partition, args.output_path) + + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git 
a/SourceCodeTools/code/data/sourcetrail/sourcetrail_create_type_annotation_dataset.py b/SourceCodeTools/code/data/type_annotation_dataset/create_type_annotation_dataset.py similarity index 59% rename from SourceCodeTools/code/data/sourcetrail/sourcetrail_create_type_annotation_dataset.py rename to SourceCodeTools/code/data/type_annotation_dataset/create_type_annotation_dataset.py index 731b93ef..702e7c4e 100644 --- a/SourceCodeTools/code/data/sourcetrail/sourcetrail_create_type_annotation_dataset.py +++ b/SourceCodeTools/code/data/type_annotation_dataset/create_type_annotation_dataset.py @@ -2,13 +2,14 @@ import json import logging import os +from os.path import join import pandas as pd from tqdm import tqdm -from SourceCodeTools.code.data.sourcetrail.file_utils import unpersist, unpersist_if_present +from SourceCodeTools.code.data.file_utils import unpersist, unpersist_if_present from SourceCodeTools.nlp import create_tokenizer -from SourceCodeTools.nlp.entity.annotator.annotator_utils import to_offsets, adjust_offsets2, \ +from SourceCodeTools.code.annotator_utils import to_offsets, adjust_offsets2, \ resolve_self_collisions2 from SourceCodeTools.nlp.spacy_tools import isvalid @@ -82,19 +83,34 @@ def correct_entities(entities, removed_offsets): for offset_len, offset in zip(offset_lens, offsets_sorted): new_entities = [] for entity in for_correction: - if offset[0] <= entity[0] and offset[1] <= entity[0]: + if offset[0] <= entity[0] and offset[1] <= entity[0]: # removed span is to the left of the entitity if len(entity) == 2: new_entities.append((entity[0] - offset_len, entity[1] - offset_len)) elif len(entity) == 3: new_entities.append((entity[0] - offset_len, entity[1] - offset_len, entity[2])) else: raise Exception("Invalid entity size") - elif offset[0] >= entity[1] and offset[1] >= entity[1]: + elif offset[0] >= entity[1] and offset[1] >= entity[1]: # removed span is to the right of the entitity new_entities.append(entity) - elif offset[0] <= entity[1] <= 
offset[1] or offset[0] <= entity[0] <= offset[1]: - pass # likely to be a type annotation being removed + elif offset[0] <= entity[0] <= offset[1] and offset[0] <= entity[1] <= offset[1]: # removed span covers the entity + pass + elif offset[0] <= entity[0] <= offset[1] and entity[0] <= offset[1] <= entity[1]: # removed span overlaps on the left + if len(entity) == 3: + new_entities.append((entity[0] - offset_len + offset[1] - entity[1], entity[1] - offset_len, entity[2])) + elif len(entity) == 2: + new_entities.append((entity[0] - offset_len + offset[1] - entity[1], entity[1] - offset_len)) + else: + raise Exception("Invalid entity size") + elif entity[0] <= offset[0] <= entity[1] and entity[0] <= entity[1] <= offset[1]: # removed span overlaps on the right + if len(entity) == 3: + new_entities.append((entity[0], entity[1] - offset_len + offset[1] - entity[1], entity[2])) + elif len(entity) == 2: + new_entities.append((entity[0], entity[1] - offset_len + offset[1] - entity[1])) + else: + raise Exception("Invalid entity size") else: - raise Exception("Invalid data?") + logging.warning(f"Encountered invalid offset: {entity}") + # raise Exception("Invalid data?") for_correction = new_entities @@ -160,6 +176,9 @@ def unpack_returns(body: str, labels: pd.DataFrame): :param labels: DataFrame with information about return type annotation :return: Trimmed body and list of return types (normally one). 
""" + if labels is None: + return [], [] + returns = [] for ind, row in labels.iterrows(): @@ -193,6 +212,22 @@ def unpack_returns(body: str, labels: pd.DataFrame): return ret, cuts +def get_defaults_spans(body): + root = ast.parse(body) + defaults_offsets = to_offsets( + body, + [(arg.lineno-1, arg.end_lineno-1, arg.col_offset, arg.end_col_offset, "default") for arg in root.body[0].args.defaults], + as_bytes=True + ) + + extended = [] + for start, end, label in defaults_offsets: + while body[start] != "=": + start -= 1 + extended.append((start, end)) + return extended + + def unpack_annotations(body, labels): """ Use information from ast package to strip type annotation from function body @@ -200,6 +235,11 @@ def unpack_annotations(body, labels): :param labels: DataFrame with information about type annotations :return: Trimmed body and list of annotations. """ + if labels is None: + return [], [] + + global remove_default + variables = [] annotations = [] @@ -215,6 +255,7 @@ def unpack_annotations(body, labels): # but type annotations usually appear in the end of signature and in the beginnig of a line variables = to_offsets(body, variables, as_bytes=True) annotations = to_offsets(body, annotations, as_bytes=True) + defaults_spans = get_defaults_spans(body) cuts = [] vars = [] @@ -225,21 +266,35 @@ def unpack_annotations(body, labels): head = body[:offset_ann[0]] orig_len = len(head) - head = head.rstrip() + head = head.rstrip(" \\\n\t") # include character \ since it can be used to indicate line break stripped_len = len(head) annsymbol = ":" assert head.endswith(annsymbol) beginning = beginning - (orig_len - stripped_len) - len(annsymbol) + # Workaround if there is a space before annotation. 
Example: "groupby : List[str]" + while beginning > 0 and body[beginning - 1] == " ": + beginning -= 1 cuts.append((beginning, end)) assert offset_var[0] != len(head) vars.append((offset_var[0], beginning, preprocess(body[offset_ann[0]: offset_ann[1]]))) + if remove_default: + cuts.extend(defaults_spans) + return vars, cuts -def process_body(nlp, body: str, replacements=None): +def body_valid(body): + try: + ast.parse(body) + return True + except: + return False + + +def process_body(nlp, body: str, replacements=None, require_labels=False): """ Extract annotation information, strip documentation and type annotations. :param nlp: Spacy tokenizer @@ -267,9 +322,10 @@ def process_body(nlp, body: str, replacements=None): body_, replacements, docstrings = remove_offsets(body_, replacements, docsting_offsets) entry['docstrings'].extend(docstrings) + was_valid = body_valid(body_) initial_labels = get_initial_labels(body_) - if initial_labels is None: + if require_labels and initial_labels is None: return None returns, return_cuts = unpack_returns(body_, initial_labels) @@ -277,6 +333,11 @@ def process_body(nlp, body: str, replacements=None): body_, replacements_annotations, _ = remove_offsets(body_, replacements + annotations, return_cuts + annotation_cuts) + is_valid = body_valid(body_) + if was_valid != is_valid: + print("Failed processing") + return None + # raise Exception() replacements_annotations = adjust_offsets2(replacements_annotations, len(initial_strip)) body_ = initial_strip + body_ @@ -288,7 +349,7 @@ def process_body(nlp, body: str, replacements=None): entry['replacements'] = resolve_self_collisions2(entry['replacements']) - assert isvalid(nlp, body_, entry['replacements']) + # assert isvalid(nlp, body_, entry['replacements']) assert isvalid(nlp, body_, entry['ents']) return entry @@ -386,25 +447,28 @@ def load_names(nodes_path): return names -def process_package(working_directory, global_names=None): +def process_package(working_directory, 
global_names=None, require_labels=False): """ Find functions with annotations, extract annotation information, strip documentation and type annotations. :param working_directory: location of package related files :param global_names: optional, mapping from global node ids to names :return: list of entries in spacy compatible format """ - bodies = unpersist_if_present(os.path.join(working_directory, "source_graph_bodies.bz2")) - if bodies is None: - return [] + # bodies = unpersist_if_present(os.path.join(working_directory, "source_graph_bodies.bz2")) + # if bodies is None: + # return [] - offsets_path = os.path.join(working_directory, "offsets.bz2") + # offsets_path = os.path.join(working_directory, "offsets.bz2") - # offsets store information about spans for nodes referenced in the source code - if os.path.isfile(offsets_path): - offsets = unpersist(offsets_path) - else: - logging.warning(f"No file with offsets: {offsets_path}") - offsets = None + # # offsets store information about spans for nodes referenced in the source code + # if os.path.isfile(offsets_path): + # offsets = unpersist(offsets_path) + # else: + # logging.warning(f"No file with offsets: {offsets_path}") + # offsets = None + + if not os.path.isfile(join(working_directory, "has_annotations")): + return [] def load_local2global(working_directory): local2global = unpersist(os.path.join(working_directory, "local2global_with_ast.bz2")) @@ -415,62 +479,184 @@ def load_local2global(working_directory): local_names = load_names(os.path.join(working_directory, "nodes_with_ast.bz2")) - nlp = create_tokenizer("spacy") + node_maps = get_node_maps(unpersist(join(working_directory, "nodes_with_ast.bz2"))) + filecontent = get_filecontent_maps(unpersist(join(working_directory, "filecontent_with_package.bz2"))) + offsets = group_offsets(unpersist(join(working_directory, "offsets.bz2"))) data = [] + nlp = create_tokenizer("spacy") - for ind, (_, row) in tqdm( - enumerate(bodies.iterrows()), total=len(bodies), - 
leave=True, desc=os.path.basename(working_directory) - ): - body = row['body'] - - if offsets is not None: - graph_node_spans = offsets_for_func(offsets, body, row["id"]) - else: - graph_node_spans = [] - - entry = process_body(nlp, body, replacements=graph_node_spans) + for ind, (f_body, f_offsets) in enumerate(iterate_functions(offsets, node_maps, filecontent)): + try: + entry = process_body(nlp, f_body, replacements=f_offsets, require_labels=require_labels) + except Exception as e: + logging.warning("Error during processing") + print(working_directory) + print(e) + continue if entry is not None: entry = to_global_ids(entry, id_maps, global_names, local_names) data.append(entry) + # nlp = create_tokenizer("spacy") + # + # data = [] + # + # for ind, (_, row) in tqdm( + # enumerate(bodies.iterrows()), total=len(bodies), + # leave=True, desc=os.path.basename(working_directory) + # ): + # body = row['body'] + # + # if offsets is not None: + # graph_node_spans = offsets_for_func(offsets, body, row["id"]) + # else: + # graph_node_spans = [] + # + # entry = process_body(nlp, body, replacements=graph_node_spans) + # + # if entry is not None: + # entry = to_global_ids(entry, id_maps, global_names, local_names) + # data.append(entry) + return data -def main(): +def iterate_functions(offsets, nodes, filecontent): + + allowed_entity_types = {"class_method", "function"} + + for package_id in offsets: + content = filecontent[package_id] + + # entry is a function or a class + for (entity_start, entity_end, entity_node_id), entity_offsets in offsets[package_id].items(): + if nodes[entity_node_id][1] in allowed_entity_types: + body = content[entity_start: entity_end] + adjusted_entity_offsets = adjust_offsets2(entity_offsets, -entity_start) + + yield body, adjusted_entity_offsets + + +def get_node_maps(nodes): + return dict(zip(nodes["id"], zip(nodes["serialized_name"], nodes["type"]))) + + +def get_filecontent_maps(filecontent): + return dict(zip(zip(filecontent["package"], 
filecontent["id"]), filecontent["content"])) + + +def group_offsets(offsets): + """ + :param offsets: Dataframe with offsets + :return: offsets grouped first by package name and file id, and then by the entity in which they occur. + """ + # This function will process all function that have graph annotations. If there are no + # annotations - the function is not processed. + offsets_grouped = {} + + for file_id, start, end, node_id, mentioned_in, package in offsets.values: + package_id = (package, file_id) + if package_id not in offsets_grouped: + offsets_grouped[package_id] = {} + + offset_ent = (start, end, node_id) + + for e in mentioned_in: + e = tuple(e) + if e not in offsets_grouped[package_id]: + offsets_grouped[package_id][e] = [] + + offsets_grouped[package_id][e].append(offset_ent) + + return offsets_grouped + + +def create_from_dataset(args): from argparse import ArgumentParser - parser = ArgumentParser() - parser.add_argument("packages", type=str, help="") - parser.add_argument("output_dataset", type=str, help="") - parser.add_argument("--format", "-f", dest="format", default="jsonl", help="jsonl|csv") - parser.add_argument("--global_nodes", "-g", dest="global_nodes", default=None) + # parser = ArgumentParser() + # parser.add_argument("dataset_path", type=str, help="") + # parser.add_argument("output_path", type=str, help="") + # parser.add_argument("--format", "-f", dest="format", default="jsonl", help="jsonl|csv") + # parser.add_argument("--remove_default", action="store_true", default=False) + # + # args = parser.parse_args() + + global remove_default + remove_default = args.remove_default + + node_maps = get_node_maps(unpersist(join(args.dataset_path, "common_nodes.json.bz2"))) + filecontent = get_filecontent_maps(unpersist(join(args.dataset_path, "common_filecontent.json.bz2"))) + offsets = group_offsets(unpersist(join(args.dataset_path, "common_offsets.json.bz2"))) + + data = [] + nlp = create_tokenizer("spacy") + + for ind, (f_body, f_offsets) in 
enumerate(iterate_functions(offsets, node_maps, filecontent)): + data.append(process_body(nlp, f_body, replacements=f_offsets, require_labels=args.require_labels)) + + store(data, args) - args = parser.parse_args() + +def create_from_environments(args): + # from argparse import ArgumentParser + # parser = ArgumentParser() + # parser.add_argument("packages", type=str, help="") + # parser.add_argument("output_path", type=str, help="") + # parser.add_argument("--format", "-f", dest="format", default="jsonl", help="jsonl|csv") + # parser.add_argument("--global_nodes", "-g", dest="global_nodes", default=None) + # parser.add_argument("--remove_default", default=False, action="store_true") + + # args = parser.parse_args() global_names = load_names(args.global_nodes) + global remove_default + remove_default = args.remove_default + data = [] - for package in os.listdir(args.packages): - pkg_path = os.path.join(args.packages, package) + for package in os.listdir(args.dataset_path): + pkg_path = os.path.join(args.dataset_path, package) if not os.path.isdir(pkg_path): continue - data.extend(process_package(working_directory=pkg_path, global_names=global_names)) + data.extend(process_package(working_directory=pkg_path, global_names=global_names, require_labels=args.require_labels)) + + store(data, args) + +def store(data, args): if args.format == "jsonl": # jsonl format is used by spacy - with open(args.output_dataset, "w") as sink: + with open(args.output_path, "w") as sink: for entry in data: sink.write(f"{json.dumps(entry)}\n") elif args.format == "csv": - if os.path.isfile(args.output_dataset): + if os.path.isfile(args.output_path): header = False else: header = True - pd.DataFrame(data).to_csv(args.output_dataset, index=False, header=header) + pd.DataFrame(data).to_csv(args.output_path, index=False, header=header) if __name__ == "__main__": - main() + from argparse import ArgumentParser + + parser = ArgumentParser() + parser.add_argument("dataset_format", type=str) + 
parser.add_argument("dataset_path", type=str, help="") + parser.add_argument("output_path", type=str, help="") + parser.add_argument("--format", "-f", dest="format", default="jsonl", help="jsonl|csv") + parser.add_argument("--remove_default", action="store_true", default=False) + parser.add_argument("--global_nodes", "-g", dest="global_nodes", default=None) + parser.add_argument("--require_labels", action="store_true", default=False) + args = parser.parse_args() + + if args.dataset_format == "envs": + create_from_environments(args) + elif args.dataset_format == "dataset": + create_from_dataset(args) + else: + raise ValueError("supperted dataset formats are: envs|dataset") + # remove_default = False diff --git a/SourceCodeTools/code/data/type_annotation_dataset/map_args_to_mentions.py b/SourceCodeTools/code/data/type_annotation_dataset/map_args_to_mentions.py new file mode 100644 index 00000000..75582ef7 --- /dev/null +++ b/SourceCodeTools/code/data/type_annotation_dataset/map_args_to_mentions.py @@ -0,0 +1,61 @@ +import argparse +import json +from os.path import join + +import pandas as pd + +from SourceCodeTools.code.common import read_nodes +from SourceCodeTools.code.data.dataset.Dataset import load_data +from SourceCodeTools.code.data.file_utils import unpersist + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("working_directory") + parser.add_argument("output") + parser.add_argument("--dataset_file", default=None) + args = parser.parse_args() + + if args.dataset_file is None: + args.dataset_file = join(args.working_directory, "function_annotations.jsonl") + + # nodes, edges = load_data(join(args.working_directory, "common_nodes.json.bz2"), join(args.working_directory, "common_edges.json.bz2")) + + arguments = set() + mentions = set() + for nodes in read_nodes(join(args.working_directory, "common_nodes.json.bz2"), as_chunks=True): + arguments.update(nodes.query("type == 'arg'")["id"]) + mentions.update(nodes.query("type == 
'mention'")["id"]) + + # type_annotated = set(unpersist(join(args.working_directory, "type_annotations.json.bz2"))["src"].tolist()) + + scrutinize_edges = [] + + for edges in read_nodes(join(args.working_directory, "common_edges.json.bz2"), as_chunks=True): + edges = edges.query("(source_node_id in @mentions) and (target_node_id in @arguments)", local_dict={"arguments": arguments, "mentions": mentions}) + scrutinize_edges.append(edges) + + edges = pd.concat(scrutinize_edges) + mapping = {} + for src, dst in edges[["source_node_id", "target_node_id"]].values: + if dst in mapping: + print() + mapping[dst] = src + + with open(args.output, "w") as sink: + with open(args.dataset_file) as fa: + for line in fa: + entry = json.loads(line) + new_repl = [[s, e, int(mapping.get(r, r))] for s, e, r in entry["replacements"]] + entry["replacements"] = new_repl + + sink.write(f"{json.dumps(entry)}\n") + + + print() + + # pickle.dump(mapping, open(args.output, "wb")) + + +if __name__ == "__main__": + main() diff --git a/SourceCodeTools/code/data/type_annotation_dataset/split_dataset.py b/SourceCodeTools/code/data/type_annotation_dataset/split_dataset.py new file mode 100644 index 00000000..911cb8b2 --- /dev/null +++ b/SourceCodeTools/code/data/type_annotation_dataset/split_dataset.py @@ -0,0 +1,52 @@ +import argparse +import json +import os +from collections import Counter + +from SourceCodeTools.nlp.entity.utils.data import read_data + + +def get_all_annotations(dataset): + ann = [] + for _, annotations in dataset: + for _, _, e in annotations["entities"]: + ann.append(e) + return ann + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--data_path', dest='data_path', default=None, + help='Path to the dataset file') + parser.add_argument('--min_entity_count', dest='min_entity_count', default=3, type=int, + help='') + parser.add_argument('--random_seed', dest='random_seed', default=None, type=int, + help='') + 
parser.add_argument('--name_suffix', default="", type=str, + help='') + + args = parser.parse_args() + + train_data, test_data = read_data( + open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, include_only="entities", + min_entity_count=args.min_entity_count, random_seed=args.random_seed + ) + + directory = os.path.dirname(args.data_path) + + def write_to_file(data, directory, suffix, partition): + with open(os.path.join(directory, f"type_prediction_dataset_{suffix}_{partition}.json"), "w") as sink: + for entry in data: + sink.write(f"{json.dumps(entry)}\n") + + write_to_file(train_data, directory, args.name_suffix, "train") + write_to_file(test_data, directory, args.name_suffix, "test") + + ent_counts = Counter(get_all_annotations(train_data)) | Counter(get_all_annotations(test_data)) + + with open(os.path.join(directory, f"type_prediction_dataset_{args.name_suffix}_annotations_counts.txt"), "w") as sink: + for ent, count in ent_counts.most_common(): + sink.write(f"{ent}\t{count}\n") + + print() \ No newline at end of file diff --git a/SourceCodeTools/code/detect_hierarchy.py b/SourceCodeTools/code/detect_hierarchy.py index 1b78c95a..937fcd2a 100644 --- a/SourceCodeTools/code/detect_hierarchy.py +++ b/SourceCodeTools/code/detect_hierarchy.py @@ -1,8 +1,8 @@ import argparse -from SourceCodeTools.code.data.sourcetrail.Dataset import load_data +from SourceCodeTools.code.data.dataset.Dataset import load_data from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import edge_types -from SourceCodeTools.code.data.sourcetrail.file_utils import * +from SourceCodeTools.code.data.file_utils import * import networkx as nx from functools import lru_cache diff --git a/SourceCodeTools/code/experiments/Experiments.py b/SourceCodeTools/code/experiments/Experiments.py index 85bb8295..8518ea73 100644 --- a/SourceCodeTools/code/experiments/Experiments.py +++ b/SourceCodeTools/code/experiments/Experiments.py @@ -1,4 +1,7 @@ # %% 
+import json +from typing import Iterable + import pandas from os.path import join @@ -6,6 +9,8 @@ import numpy as np # from graphtools import Embedder +from SourceCodeTools.code.data.dataset.Dataset import filter_dst_by_freq +from SourceCodeTools.code.data.file_utils import unpersist from SourceCodeTools.models.Embedder import Embedder import pickle @@ -24,6 +29,13 @@ def get_nodes_with_split_ids(embedder, split_ids): # ]['id'].to_numpy() +def shuffle(X, y): + np.random.seed(42) + ind_shuffle = np.arange(0, X.shape[0]) + np.random.shuffle(ind_shuffle) + return X[ind_shuffle], y[ind_shuffle] + + class Experiments: """ Provides convenient interface for creating experiments. @@ -36,18 +48,11 @@ class Experiments: typeann - """ - def __init__(self, - base_path=None, - api_seq_path=None, - type_use_path=None, - type_link_path=None, - type_link_train_path=None, - type_link_test_path=None, - node_type_path=None, - variable_use_path=None, - function_name_path=None, - type_ann=None, - gnn_layer=-1): + def __init__( + self, base_path=None,api_seq_path=None, type_use_path=None, type_link_path=None, type_link_train_path=None, + type_link_test_path=None, node_type_path=None, variable_use_path=None, function_name_path=None, + type_ann=None, gnn_layer=-1, embeddings_path=None, **kwargs + ): """ :param base_path: path tp trained gnn model @@ -58,6 +63,7 @@ def __init__(self, :param function_name_path: path to function name edges :param gnn_layer: which gnn layer is used for node embeddings """ + self.kwargs = kwargs self.experiments = { 'fcall': function_name_path, @@ -78,7 +84,9 @@ def __init__(self, # self.splits = torch.load(os.path.join(self.base_path, "state_dict.pt"))["splits"] if base_path is not None: - self.embed = pickle.load(open(join(self.base_path, "embeddings.pkl"), "rb"))[gnn_layer] + self.embed = pickle.load(open(embeddings_path, "rb")) + if isinstance(self.embed, Iterable): + self.embed = self.embed[gnn_layer] # alternative_nodes = pickle.load(open("nodes.pkl", 
"rb")) # self.embed.e = alternative_nodes # self.embed.e = np.random.randn(self.embed.e.shape[0], self.embed.e.shape[1]) @@ -112,12 +120,12 @@ def __getitem__(self, type: str): :param type: str description of the experiment :return: Experiment object """ - nodes = pandas.read_csv(join(self.base_path, "nodes.csv")) - edges = pandas.read_csv(join(self.base_path, "held.csv")).astype({"src": "int32", "dst": "int32"}) - from SourceCodeTools.code.data.sourcetrail.Dataset import SourceGraphDataset + nodes = unpersist(join(self.base_path, "common_nodes.json.bz2")) + # edges = pandas.read_csv(join(self.base_path, "held.csv")).astype({"src": "int32", "dst": "int32"}) + edges = None # unpersist(join(self.base_path, "common_edges.json.bz2")) - self.splits = SourceGraphDataset.get_global_graph_id_splits(nodes) + # self.splits = SourceGraphDataset.get_global_graph_id_splits(nodes) # global_ids = nodes['global_graph_id'].values # self.splits = ( # global_ids[nodes['train_mask'].values], @@ -200,22 +208,113 @@ def __getitem__(self, type: str): compact_dst=False) elif type == "typeann": - type_ann = pandas.read_csv(self.experiments['typeann']).astype({"src": "int32", "dst": "str"}) - # node_pool = set(self.splits[2]).union(self.splits[1]).union(self.splits[0]) - # node_pool = set(nodes['id'].values.tolist()) - # node_pool = set(get_nodes_with_split_ids(nodes, set(self.splits[2]).union(self.splits[1]).union(self.splits[0]))) - node_pool = set( - get_nodes_with_split_ids(self.embed, set(self.splits[2]).union(self.splits[1]).union(self.splits[0]))) + random_seed = 42 + min_entity_count = 3 + + type_annotations_path = self.experiments['typeann'] + from pathlib import Path + dataset_dir = Path(type_annotations_path).parent + + type_ann = unpersist(self.experiments['typeann']) + + # node_names = dict(zip(nodes["id"], nodes["serialized_name"])) + + filter_rule = lambda name: "0x" not in name type_ann = type_ann[ - type_ann['src'].apply(lambda nid: nid in node_pool) + 
type_ann["dst"].apply(filter_rule) + ] + + node2id = dict(zip(nodes["id"], nodes["type"])) + type_ann["src_type"] = type_ann["src"].apply(lambda x: node2id[x]) + + type_ann = type_ann[ + type_ann["src_type"].apply(lambda type_: type_ in {"mention"}) + ] + + norm = lambda x: x.strip("\"").strip("'").split("[")[0].split(".")[-1] + + type_ann["dst"] = type_ann["dst"].apply(norm) + type_ann = filter_dst_by_freq(type_ann, min_entity_count) + + # this is used for codebert embeddings + filter_rule = lambda id_: id_ in self.embed.ind + + type_ann = type_ann[ + type_ann["src"].apply(filter_rule) ] + test_only_popular_types = self.kwargs["only_popular_types"] + if test_only_popular_types: + allowed = set(self.kwargs["popular_types"]) + + type_ann = type_ann[ + type_ann["dst"].apply(lambda type_: type_ in allowed) + ] + else: + allowed = None + + from SourceCodeTools.nlp.entity.type_prediction import filter_labels + def read_dataset_file(path): + with open(path, "r") as source: + return [json.loads(line) for line in source] + + train_data = filter_labels( + pickle.load(open(dataset_dir.joinpath("type_prediction_dataset_no_defaults_train.pkl"), "rb")), + # read_dataset_file(dataset_dir.joinpath("type_prediction_dataset_no_default_args_mapped_train.json")), + allowed=allowed + ) + test_data = filter_labels( + pickle.load(open(dataset_dir.joinpath("type_prediction_dataset_no_defaults_test.pkl"), "rb")), + # read_dataset_file(dataset_dir.joinpath("type_prediction_dataset_no_default_args_mapped_test.json")), + allowed=allowed + ) + + def get_ids(typeann_dataset): + ids = [] + for sent, annotations in typeann_dataset: + for _, _, r in annotations["replacements"]: + ids.append(r) + + return set(ids) + + train_ids = get_ids(train_data) + test_ids = get_ids(test_data) + + type_ann = type_ann[["src", "dst"]] + + # splits = get_train_val_test_indices(nodes.index) + # create_train_val_test_masks(nodes, *splits) + + # train_data, test_data = read_data( + # 
open(os.path.join(self.base_path, "function_annotations.jsonl"), "r").readlines(), + # normalize=True, allowed=None, include_replacements=True, + # include_only="entities", + # min_entity_count=min_entity_count, random_seed=random_seed + # ) + # + # train_nodes = set() + # for _, ann in train_data: + # for _, _, nid in ann["replacements"]: + # train_nodes.add(nid) + # + # test_nodes = set() + # for _, ann in test_data: + # for _, _, nid in ann["replacements"]: + # test_nodes.add(nid) + # + # type_nodes = set(type_ann["src"]) + # + # train_nodes = train_nodes.intersection(type_nodes) + # test_nodes = test_nodes.intersection(type_nodes) + # return Experiment2(self.embed, nodes, edges, type_ann, split_on="nodes", neg_sampling_strategy="word2vec") - return Experiment3(self.embed, nodes, edges, type_ann, split_on="edges", - neg_sampling_strategy="word2vec", compact_dst=True) + return Experiment3( + self.embed, nodes, edges, type_ann, split_on="edges", neg_sampling_strategy="word2vec", + compact_dst=True, train_ids=train_ids, test_ids=test_ids + ) elif type == "typeann_name": type_ann = pandas.read_csv(self.experiments['typeann']).astype({"src": "int32", "dst": "str"}) @@ -298,7 +397,8 @@ def __init__(self, split_on="nodes", neg_sampling_strategy="word2vec", K=1, - test_frac=0.1, compact_dst=True): + test_frac=0.1, compact_dst=True, + train_ids=None, test_ids=None): # store local copies" self.embed = embeddings @@ -310,6 +410,8 @@ def __init__(self, self.neg_smpl_strategy = neg_sampling_strategy self.K = K self.TEST_FRAC = test_frac + self.train_ids = train_ids + self.test_ids = test_ids # make sure to drop duplicate edges to prevent leakage into the test set # do it before creating experiment? 
@@ -391,11 +493,6 @@ def get_training_data(self): assert X_train.shape[0] == y_train.shape[0] assert X_test.shape[0] == y_test.shape[0] - def shuffle(X, y): - ind_shuffle = np.arange(0, X.shape[0]) - np.random.shuffle(ind_shuffle) - return X[ind_shuffle], y[ind_shuffle] - self.X_train, self.y_train = shuffle(X_train, y_train) self.X_test, self.y_test = shuffle(X_test, y_test) @@ -557,19 +654,20 @@ def __init__(self, embeddings: Embedder, split_on="nodes", neg_sampling_strategy="word2vec", K=1, - test_frac=0.1, compact_dst=True): + test_frac=0.1, compact_dst=True, + train_ids=None, test_ids=None): super(Experiment2, self).__init__(embeddings, nodes, edges, target, split_on=split_on, neg_sampling_strategy=neg_sampling_strategy, K=K, test_frac=test_frac, - compact_dst=compact_dst) + compact_dst=compact_dst, train_ids=train_ids, test_ids=test_ids) # TODO # make sure that compact_property work always identically between runs - self.name_map = compact_property(target['dst']) + self.name_map, self.inv_index = compact_property(target['dst'], return_order=True) self.dst_orig = target['dst'] target['orig_dst'] = target['dst'] target['dst'] = target['dst'].apply(lambda name: self.name_map[name]) - print(f"Doing experiment with {len(self.name_map)} distinct target targets") + print(f"Doing experiment with {len(self.name_map)} distinct targets") self.unique_src = self.target['src'].unique() self.unique_dst = self.target['dst'].unique() @@ -663,7 +761,15 @@ def __init__(self, embeddings: Embedder, neg_sampling_strategy=neg_sampling_strategy, K=K, test_frac=test_frac, compact_dst=compact_dst) - from SourceCodeTools.models.graph.ElementEmbedder import window, hashstr, create_fixed_length + from SourceCodeTools.models.graph.ElementEmbedder import create_fixed_length + import hashlib + def hashstr(s, num_buckets): + return int(hashlib.md5(s.encode('utf8')).hexdigest(), 16) % num_buckets + + def window(x, gram_size): + x = "<" + x + ">" + length = len(x) + return (x[i:i + gram_size] 
for i in range(0, length) if i+gram_size<=length) reprs = target['orig_dst'].map(lambda x: window(x, gram_size)) \ .map(lambda grams: (hashstr(g, num_buckets) for g in grams)) \ @@ -684,33 +790,40 @@ def get_negative_out(self, num): class Experiment3(Experiment2): - def __init__(self, embeddings: Embedder, - nodes: pandas.DataFrame, - edges: pandas.DataFrame, - target: pandas.DataFrame, - split_on="nodes", - neg_sampling_strategy="word2vec", - K=1, - test_frac=0.1, compact_dst=True - ): + def __init__( + self, embeddings: Embedder, + nodes: pandas.DataFrame, + edges: pandas.DataFrame, + target: pandas.DataFrame, + split_on="nodes", + neg_sampling_strategy="word2vec", + K=1, + test_frac=0.1, compact_dst=True, + train_ids=None, test_ids=None + ): super(Experiment3, self).__init__(embeddings, nodes, edges, target, split_on=split_on, neg_sampling_strategy=neg_sampling_strategy, K=K, test_frac=test_frac, - compact_dst=compact_dst) + compact_dst=compact_dst, train_ids=train_ids, test_ids=test_ids) def get_training_data(self): # self.get_train_test_split() - train_positive = self.target.iloc[self.train_edge_ind].values - test_positive = self.target.iloc[self.test_edge_ind].values + if self.train_ids is not None and self.test_ids is not None: + test_positive = self.target[ + self.target["src"].apply(lambda x: x in self.test_ids) + ].values + chosen_for_test = set(test_positive[:, 0].tolist()) + train_positive = self.target[ + self.target["src"].apply(lambda x: x in self.train_ids and x not in chosen_for_test) + ].values - X_train, y_train = train_positive[:, 0].reshape(-1, 1), train_positive[:, 1].reshape(-1, 1) - X_test, y_test = test_positive[:, 0].reshape(-1, 1), test_positive[:, 1].reshape(-1, 1) + else: + train_positive = self.target.iloc[self.train_edge_ind].values + test_positive = self.target.iloc[self.test_edge_ind].values - def shuffle(X, y): - ind_shuffle = np.arange(0, X.shape[0]) - np.random.shuffle(ind_shuffle) - return X[ind_shuffle], y[ind_shuffle] + 
X_train, y_train = train_positive[:, 0].reshape(-1, 1).astype(np.int32), train_positive[:, 1].reshape(-1, 1).astype(np.int32) + X_test, y_test = test_positive[:, 0].reshape(-1, 1).astype(np.int32), test_positive[:, 1].reshape(-1, 1).astype(np.int32) self.X_train, self.y_train = shuffle(X_train, y_train) self.X_test, self.y_test = shuffle(X_test, y_test) diff --git a/SourceCodeTools/code/experiments/classifiers.py b/SourceCodeTools/code/experiments/classifiers.py index ff037f9b..3969021f 100644 --- a/SourceCodeTools/code/experiments/classifiers.py +++ b/SourceCodeTools/code/experiments/classifiers.py @@ -1,3 +1,5 @@ +import logging + import tensorflow as tf from tensorflow.keras.layers import Dense, Flatten, Conv2D, Input, Embedding, concatenate @@ -95,6 +97,9 @@ class NodeClassifier(Model): def __init__(self, node_emb_size, n_classes, h_size=None): super(NodeClassifier, self).__init__() + self.proj = Dense(node_emb_size, use_bias=False) + logging.warning("Using projection matrix") + if h_size is None: h_size = [30, 15] @@ -115,7 +120,7 @@ def __init__(self, node_emb_size, n_classes, h_size=None): # self.logits = Dense(n_classes, input_shape=(h_size[1],)) def __call__(self, x, **kwargs): - h = x + h = self.proj(x) for l in self.layers_: h = l(h) return h diff --git a/SourceCodeTools/code/experiments/run_experiment.py b/SourceCodeTools/code/experiments/run_experiment.py index 7c287c8d..46990e9b 100644 --- a/SourceCodeTools/code/experiments/run_experiment.py +++ b/SourceCodeTools/code/experiments/run_experiment.py @@ -1,5 +1,7 @@ import os +from sklearn import metrics + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -22,6 +24,169 @@ all_experiments = link_prediction_experiments + name_prediction_experiments + node_classification_experiments + name_subword_predictor +class Tracker: + def __init__(self, inverted_index=None): + self.all_true = [] + self.all_estimated = [] + 
self.all_emb = [] + self.inv_index = inverted_index + + def add(self, embs, pred, true): + self.all_true.append(true) + self.all_estimated.append(pred) + self.all_emb.append(embs) + + def decode_label_names(self, ids): + assert self.inv_index is not None, "Need inverted index" + return list(map(lambda x: self.inv_index[x], ids)) + + @property + def true_labels(self): + return np.concatenate(self.all_true, axis=0).reshape(-1,).tolist() + + @property + def true_label_names(self): + return self.decode_label_names(self.true_labels) + + @property + def pred_labels(self): + return np.argmax(np.concatenate(self.all_estimated, axis=0), axis=-1).reshape(-1, ).tolist() + + @property + def pred_label_names(self): + return self.decode_label_names(self.pred_labels) + + @property + def pred_scores(self): + return np.concatenate(self.all_estimated, axis=0) + + @property + def embeddings(self): + return np.concatenate(self.all_emb, axis=0) + + def clear(self): + self.all_true.clear() + self.all_estimated.clear() + self.all_emb.clear() + + def save_embs_for_tb(self, save_name): + assert self.inv_index is not None, "Cannot export for tensorboard without metadata" + np.savetxt(f"{save_name}_embeddings.tsv", self.embeddings, delimiter="\t") + with open(f"{save_name}_meta.tsv", "w") as meta_sink: + for label in list(map(lambda x: self.inv_index[x], self.true_labels)): + meta_sink.write(f"{label}\n") + + def save_umap(self, save_name): + type_freq = { + "str": 532, + "Optional": 232, + "int": 206, + "Any": 171, + "Union": 156, + "bool": 143, + "Callable": 80, + "Dict": 77, + "bytes": 58, + "float": 48 + } + from umap import UMAP + import matplotlib.pyplot as plt + # plt.rcParams.update({'font.size': 5}) + reducer = UMAP(50) + embedding = reducer.fit_transform(self.embeddings) + + labels = list(map(lambda x: self.inv_index[x], self.true_labels)) + unique_labels = sorted(list(set(labels))) + + plt.figure(figsize=(6,6)) + legend = [] + for label in unique_labels: + if label not in 
type_freq: + continue + xs = [] + ys = [] + for lbl, (x, y) in zip(labels, embedding): + if lbl == label: + xs.append(x) + ys.append(y) + plt.scatter(xs, ys, 1.) + legend.append(label) + plt.axis('off') + plt.legend(legend) + plt.savefig(f"{save_name}_umap.pdf") + plt.close() + # plt.show() + + + + def get_metrics(self): + all_true = self.true_labels + all_scores = self.pred_scores + + metric_dict = {} + + for k in [1,3,5]: + metric_dict[f"Acc@{k}"] = metrics.top_k_accuracy_score(y_true=all_true, y_score=all_scores, k=k, labels=list(range(all_scores.shape[1]))) + + return metric_dict + + def save_confusion_matrix(self, save_path): + estimate_confusion( + self.pred_label_names, + self.true_label_names, + save_path=save_path + ) + + +def estimate_confusion(pred, true, save_path): + pred_filtered = pred + true_filtered = true + + import matplotlib.pyplot as plt + import matplotlib.pyplot as plt + from matplotlib.pyplot import cm + + # plt.rcParams.update({'font.size': 50}) + + labels = sorted(list(set(true_filtered + pred_filtered))) + label2ind = dict(zip(labels, range(len(labels)))) + + confusion = np.zeros((len(labels), len(labels))) + + for pred, true in zip(pred_filtered, true_filtered): + confusion[label2ind[true], label2ind[pred]] += 1 + + norm = np.array([x if x != 0 else 1. for x in np.sum(confusion, axis=1)]).reshape(-1,1) + confusion /= norm + + + fig, ax = plt.subplots(figsize=(6,6)) + im = ax.imshow(confusion, interpolation='nearest', cmap=cm.Blues) + + # We want to show all ticks... + ax.set_xticks(np.arange(len(labels))) + ax.set_yticks(np.arange(len(labels))) + # ... and label them with the respective list entries + ax.set_xticklabels(labels) + ax.set_yticklabels(labels) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + + # Loop over data dimensions and create text annotations. 
+ for i in range(len(labels)): + for j in range(len(labels)): + text = ax.text(j, i, f"{confusion[i, j]: .2f}", + ha="center", va="center", color="w", fontsize="xx-small") + + # ax.set_title("Confusion matrix for Python type prediction") + fig.tight_layout() + # plt.show() + plt.savefig(os.path.join(save_path, "confusion.pdf")) + plt.close() + + def run_experiment(e, experiment_name, args): experiment = e[experiment_name] @@ -70,8 +235,8 @@ def run_experiment(e, experiment_name, args): test_loss = tf.keras.metrics.Mean(name='test_loss') test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy') - @tf.function - def train_step(batch): + # @tf.function + def train_step(batch, tracker=None): with tf.GradientTape() as tape: # training=True is only needed if there are layers with different # behavior during training versus inference (e.g. Dropout). @@ -80,22 +245,30 @@ def train_step(batch): gradients = tape.gradient(loss, clf.trainable_variables) optimizer.apply_gradients(zip(gradients, clf.trainable_variables)) + if tracker is not None: + tracker.add(batch["x"], predictions.numpy(), batch["y"]) + train_loss(loss) train_accuracy(batch["y"], predictions) - @tf.function - def test_step(batch): + # @tf.function + def test_step(batch, tracker=None): # training=False is only needed if there are layers with different # behavior during training versus inference (e.g. Dropout). 
predictions = clf(**batch, training=False) t_loss = loss_object(batch["y"], predictions) + if tracker is not None: + tracker.add(batch["x"], predictions.numpy(), batch["y"]) + test_loss(t_loss) test_accuracy(batch["y"], predictions) - args.epochs = 500 - + trains = [] tests = [] + metrics = [] + + test_tracker = Tracker(inverted_index=experiment.inv_index if hasattr(experiment, "inv_index") else None) for epoch in range(args.epochs): # Reset the metrics at the start of the next epoch @@ -108,12 +281,17 @@ def test_step(batch): train_step(batch) if epoch % 1 == 0: + + test_tracker.clear() + for batch in experiment.test_batches(): - test_step(batch) + test_step(batch, tracker=test_tracker) ma_train = train_accuracy.result() * 100 * ma_alpha + ma_train * (1 - ma_alpha) ma_test = test_accuracy.result() * 100 * ma_alpha + ma_test * (1 - ma_alpha) + trains.append(ma_train) tests.append(ma_test) + metrics.append(test_tracker.get_metrics()) # template = 'Epoch {}, Loss: {:.4f}, Accuracy: {:.4f}, Test Loss: {:.4f}, Test Accuracy: {:.4f}, Average Test {:.4f}' # print(template.format(epoch+1, @@ -123,9 +301,26 @@ def test_step(batch): # test_accuracy.result()*100, # ma_test)) + # plot confusion matrix + if hasattr(experiment, "inv_index") and args.out_path is not None: + test_tracker.save_confusion_matrix(save_path=args.out_path) + + if hasattr(experiment, "inv_index") and args.emb_out: + test_tracker.save_embs_for_tb(save_name=os.path.join(args.out_path, "tb")) + test_tracker.save_umap(save_name=os.path.join(args.out_path, "tb")) + + print(metrics[tests.index(max(tests))]) + # ma_train = train_accuracy.result() * 100 * ma_alpha + ma_train * (1 - ma_alpha) # ma_test = test_accuracy.result() * 100 * ma_alpha + ma_test * (1 - ma_alpha) + import matplotlib.pyplot as plt + plt.plot(trains) + plt.plot(tests) + plt.legend(["Train", "Test"]) + # plt.savefig(os.path.join(args.base_path, f"{args.experiment}.png")) + plt.show() + return ma_train, max(tests) @@ -151,30 +346,43 @@ def 
test_step(batch): parser.add_argument("--element_predictor_h_size", default=50, type=int, help="") parser.add_argument("--link_predictor_h_size", default="[20]", type=str, help="") parser.add_argument("--node_classifier_h_size", default="[30,15]", type=str, help="") + parser.add_argument("--out_path", default=None, type=str, help="") + # parser.add_argument("--confusion_out_path", default=None, type=str, help="") + parser.add_argument("--trials", default=1, type=int, help="") + parser.add_argument("--emb_out", default=False, action="store_true", help="") + parser.add_argument("--only_popular_types", default=False, action="store_true", help="") + parser.add_argument("--popular_types", default="str,Optional,int,Any,Union,bool,Other,Callable,Dict,bytes,float,Description,List,Sequence,Namespace,T,Type,object,HTTPServerRequest,Future,Matcher", help="") + # parser.add_argument("--emb_out", default=None, type=str, help="") + parser.add_argument("--embeddings", default=None) parser.add_argument('--random', action='store_true') parser.add_argument('--test_embedder', action='store_true') args = parser.parse_args() + if args.out_path is None: + args.out_path = os.path.dirname(args.embeddings) + + if args.popular_types is not None: + args.popular_types = args.popular_types.split(",") + print(args.__dict__) - e = Experiments(base_path=args.base_path, - api_seq_path=args.api_seq, - type_link_path=args.type_link, - type_link_train_path=args.type_link_train, - type_link_test_path=args.type_link_test, - variable_use_path=args.var_use, # not needed - function_name_path=None, - type_ann=args.type_ann, - gnn_layer=-1, - ) + e = Experiments( + base_path=args.base_path, api_seq_path=args.api_seq, type_link_path=args.type_link, + type_link_train_path=args.type_link_train, type_link_test_path=args.type_link_test, + variable_use_path=args.var_use, # not needed + function_name_path=None, + type_ann=args.type_ann, gnn_layer=-1, embeddings_path=args.embeddings, + 
only_popular_types=args.only_popular_types, popular_types=args.popular_types + ) experiments = args.experiment.split(",") for experiment_name in experiments: - print(f"\n{experiment_name}:") - try: - train_acc, test_acc = run_experiment(e, experiment_name, args) - print("Train Accuracy: {:.4f}, Test Accuracy: {:.4f}".format(train_acc, test_acc)) - except ValueError as err: - print(err) - print("\n") + for trial in range(args.trials): + print(f"\n{experiment_name}, trial {trial}:") + try: + train_acc, test_acc = run_experiment(e, experiment_name, args) + print(f"Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}") + except ValueError as err: + print(err) + print("\n") diff --git a/SourceCodeTools/mltools/torch/__init__.py b/SourceCodeTools/mltools/torch/__init__.py new file mode 100644 index 00000000..47e7becd --- /dev/null +++ b/SourceCodeTools/mltools/torch/__init__.py @@ -0,0 +1,5 @@ +import torch + + +def compute_accuracy(pred_, true_): + return torch.sum(pred_ == true_).item() / len(true_) \ No newline at end of file diff --git a/SourceCodeTools/models/graph/ElementEmbedder.py b/SourceCodeTools/models/graph/ElementEmbedder.py index 56b111a7..4ade9c23 100644 --- a/SourceCodeTools/models/graph/ElementEmbedder.py +++ b/SourceCodeTools/models/graph/ElementEmbedder.py @@ -8,17 +8,25 @@ from SourceCodeTools.models.graph.ElementEmbedderBase import ElementEmbedderBase from SourceCodeTools.models.graph.train.Scorer import Scorer -from SourceCodeTools.models.nlp.Encoder import LSTMEncoder, Encoder +from SourceCodeTools.models.nlp.TorchEncoder import LSTMEncoder, Encoder class GraphLinkSampler(ElementEmbedderBase, Scorer): - def __init__(self, elements, nodes, compact_dst=True, dst_to_global=True, emb_size=None): + def __init__( + self, elements, nodes, compact_dst=True, dst_to_global=True, emb_size=None, device="cpu", + method="inner_prod", nn_index="brute", ns_groups=None + ): assert emb_size is not None - ElementEmbedderBase.__init__(self, 
elements=elements, nodes=nodes, compact_dst=compact_dst, dst_to_global=dst_to_global) - Scorer.__init__(self, num_embs=len(self.elements["dst"].unique()), emb_size=emb_size, src2dst=self.element_lookup) + ElementEmbedderBase.__init__( + self, elements=elements, nodes=nodes, compact_dst=compact_dst, dst_to_global=dst_to_global + ) + Scorer.__init__( + self, num_embs=len(self.elements["dst"].unique()), emb_size=emb_size, src2dst=self.element_lookup, + device=device, method=method, index_backend=nn_index, ns_groups=ns_groups + ) def sample_negative(self, size, ids=None, strategy="closest"): - if strategy == "w2v": + if strategy == "w2v" or self.scorer_index is None: negative = ElementEmbedderBase.sample_negative(self, size) else: negative = Scorer.sample_closest_negative(self, ids, k=size // len(ids)) @@ -26,10 +34,12 @@ def sample_negative(self, size, ids=None, strategy="closest"): return negative -class ElementEmbedder(ElementEmbedderBase, nn.Module): +class ElementEmbedder(ElementEmbedderBase, nn.Module, Scorer): def __init__(self, elements, nodes, emb_size, compact_dst=True): ElementEmbedderBase.__init__(self, elements=elements, nodes=nodes, compact_dst=compact_dst) nn.Module.__init__(self) + Scorer.__init__(self, num_embs=len(self.elements["dst"].unique()), emb_size=emb_size, + src2dst=self.element_lookup) self.emb_size = emb_size n_elems = self.elements['emb_id'].unique().size @@ -39,9 +49,30 @@ def __init__(self, elements, nodes, emb_size, compact_dst=True): def __getitem__(self, ids): return torch.LongTensor(ElementEmbedderBase.__getitem__(self, ids=ids)) + def sample_negative(self, size, ids=None, strategy="closest"): + # TODO + # Try other distributions + if strategy == "w2v": + negative = ElementEmbedderBase.sample_negative(self, size) + else: + ### negative = random.choices(Scorer.sample_closest_negative(self, ids), k=size) + negative = Scorer.sample_closest_negative(self, ids, k=size // len(ids)) + assert len(negative) == size + + return 
torch.LongTensor(negative) + def forward(self, input, **kwargs): return self.norm(self.embed(input)) + def set_embed(self): + all_keys = self.get_keys_for_scoring() + with torch.set_grad_enabled(False): + self.scorer_all_emb = self(torch.LongTensor(all_keys).to(self.embed.weight.device)).detach().cpu().numpy() + + def prepare_index(self): + self.set_embed() + Scorer.prepare_index(self) + # def hashstr(s, num_buckets): # return int(hashlib.md5(s.encode('utf8')).hexdigest(), 16) % num_buckets diff --git a/SourceCodeTools/models/graph/ElementEmbedderBase.py b/SourceCodeTools/models/graph/ElementEmbedderBase.py index 50d829b5..2b2727bd 100644 --- a/SourceCodeTools/models/graph/ElementEmbedderBase.py +++ b/SourceCodeTools/models/graph/ElementEmbedderBase.py @@ -16,7 +16,8 @@ def __init__(self, elements, nodes, compact_dst=True, dst_to_global=False): def init(self, compact_dst): if compact_dst: elem2id, self.inverse_dst_map = compact_property(self.elements['dst'], return_order=True) - self.elements['emb_id'] = self.elements['dst'].apply(lambda x: elem2id[x]) + self.elements['emb_id'] = self.elements['dst'].apply(lambda x: elem2id.get(x, -1)) + assert -1 not in self.elements['emb_id'].tolist() else: self.elements['emb_id'] = self.elements['dst'] @@ -49,7 +50,11 @@ def preprocess_element_data(self, element_data, nodes, compact_dst, dst_to_globa id2typedid = dict(zip(nodes['id'].tolist(), nodes['typed_id'].tolist())) id2type = dict(zip(nodes['id'].tolist(), nodes['type'].tolist())) - def get_node_pools(element_data): + self.id2nodeid = id2nodeid + self.id2typedid = id2typedid + self.id2type = id2type + + def get_node_pools(element_data): # create typed node list for possible use with dgl node_typed_pools = {} for orig_node_id in element_data['src']: global_id = id2nodeid.get(orig_node_id, None) @@ -128,9 +133,9 @@ def get_src_pool(self, ntypes=None): # return {ntype: set(self.elements.query(f"src_type == '{ntype}'")['src_typed_id'].tolist()) for ntype in ntypes} def 
_create_pools(self, train_idx, val_idx, test_idx, pool) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - train_idx = np.fromiter(pool.intersection(train_idx.tolist()), dtype=np.int64) - val_idx = np.fromiter(pool.intersection(val_idx.tolist()), dtype=np.int64) - test_idx = np.fromiter(pool.intersection(test_idx.tolist()), dtype=np.int64) + train_idx = np.fromiter(pool.intersection(train_idx.reshape((-1,)).tolist()), dtype=np.int64) + val_idx = np.fromiter(pool.intersection(val_idx.reshape((-1,)).tolist()), dtype=np.int64) + test_idx = np.fromiter(pool.intersection(test_idx.reshape((-1,)).tolist()), dtype=np.int64) return train_idx, val_idx, test_idx def create_idx_pools(self, train_idx, val_idx, test_idx): diff --git a/SourceCodeTools/models/graph/LinkPredictor.py b/SourceCodeTools/models/graph/LinkPredictor.py index 340803c6..5320e772 100644 --- a/SourceCodeTools/models/graph/LinkPredictor.py +++ b/SourceCodeTools/models/graph/LinkPredictor.py @@ -49,21 +49,74 @@ class BilinearLinkPedictor(nn.Module): def __init__(self, embedding_dim_1, embedding_dim_2, target_classes=2): super(BilinearLinkPedictor, self).__init__() - self.bilinear = nn.Bilinear(embedding_dim_1, embedding_dim_2, target_classes) + self.l1 = nn.Linear(embedding_dim_1, 300) + self.l2 = nn.Linear(embedding_dim_2, 300) + self.act = nn.Sigmoid() + self.bilinear = nn.Bilinear(300, 300, target_classes) def forward(self, x1, x2): - return self.bilinear(x1, x2) + return self.bilinear(self.act(self.l1(x1)), self.act(self.l2(x2))) class CosineLinkPredictor(nn.Module): - def __init__(self): + def __init__(self, margin=0.): + """ + Dummy link predictor, using to keep API the same + """ super(CosineLinkPredictor, self).__init__() self.cos = nn.CosineSimilarity() - self.max_margin = torch.Tensor([0.4]) + self.max_margin = torch.Tensor([margin])[0] def forward(self, x1, x2): if self.max_margin.device != x1.device: self.max_margin = self.max_margin.to(x1.device) + # this will not train logit = (self.cos(x1, x2) > 
self.max_margin).float().unsqueeze(1) - return torch.cat([1 - logit, logit], dim=1) \ No newline at end of file + return torch.cat([1 - logit, logit], dim=1) + + +class L2LinkPredictor(nn.Module): + def __init__(self, margin=1.): + super(L2LinkPredictor, self).__init__() + self.margin = torch.Tensor([margin])[0] + + def forward(self, x1, x2): + if self.margin.device != x1.device: + self.margin = self.margin.to(x1.device) + # this will not train + logit = (torch.norm(x1 - x2, dim=-1, keepdim=True) < self.margin).float() + return torch.cat([1 - logit, logit], dim=1) + + +class TransRLinkPredictor(nn.Module): + def __init__(self, input_dim, rel_dim, num_relations, margin=0.3): + super(TransRLinkPredictor, self).__init__() + self.rel_dim = rel_dim + self.input_dim = input_dim + self.margin = margin + + self.rel_emb = nn.Embedding(num_embeddings=num_relations, embedding_dim=rel_dim) + self.proj_matr = nn.Embedding(num_embeddings=num_relations, embedding_dim=input_dim * rel_dim) + self.triplet_loss = nn.TripletMarginLoss(margin=margin) + + def forward(self, a, p, n, labels): + weights = self.proj_matr(labels).reshape((-1, self.rel_dim, self.input_dim)) + rels = self.rel_emb(labels) + m_a = (weights * a.unsqueeze(1)).sum(-1) + m_p = (weights * p.unsqueeze(1)).sum(-1) + m_n = (weights * n.unsqueeze(1)).sum(-1) + + transl = m_a + rels + + sim = torch.norm(torch.cat([transl - m_p, transl - m_n], dim=0), dim=-1) + + # pos_diff = torch.norm(transl - m_p, dim=-1) + # neg_diff = torch.norm(transl - m_n, dim=-1) + + # loss = pos_diff + torch.maximum(torch.tensor([0.]).to(neg_diff.device), self.margin - neg_diff) + # return loss.mean(), sim + return self.triplet_loss(transl, m_p, m_n), sim < self.margin + + + diff --git a/SourceCodeTools/models/graph/NodeEmbedder.py b/SourceCodeTools/models/graph/NodeEmbedder.py index c6149723..28ab6015 100644 --- a/SourceCodeTools/models/graph/NodeEmbedder.py +++ b/SourceCodeTools/models/graph/NodeEmbedder.py @@ -7,6 +7,9 @@ class 
NodeEmbedder(nn.Module): def __init__(self, nodes, emb_size, dtype=None, n_buckets=500000, pretrained=None): super(NodeEmbedder, self).__init__() + self.init(nodes, emb_size, dtype, n_buckets, pretrained) + + def init(self, nodes, emb_size, dtype=None, n_buckets=500000, pretrained=None): self.emb_size = emb_size self.dtype = dtype if dtype is None: @@ -15,11 +18,13 @@ def __init__(self, nodes, emb_size, dtype=None, n_buckets=500000, pretrained=Non self.buckets = None + embedding_field = "embeddable_name" + nodes_with_embeddings = nodes.query("embeddable == True")[ - ['global_graph_id', 'typed_id', 'type', 'type_backup', 'name'] + ['global_graph_id', 'typed_id', 'type', 'type_backup', embedding_field] ] - type_name = list(zip(nodes_with_embeddings['type_backup'], nodes_with_embeddings['name'])) + type_name = list(zip(nodes_with_embeddings['type_backup'], nodes_with_embeddings[embedding_field])) self.node_info = dict(zip( list(zip(nodes_with_embeddings['type'], nodes_with_embeddings['typed_id'])), @@ -39,7 +44,7 @@ def __init__(self, nodes, emb_size, dtype=None, n_buckets=500000, pretrained=Non self._create_buckets_from_pretrained(pretrained) def _create_buckets(self): - self.buckets = nn.Embedding(self.n_buckets + 1, self.emb_size, padding_idx=self.n_buckets) + self.buckets = nn.Embedding(self.n_buckets + 1, self.emb_size, padding_idx=self.n_buckets, sparse=True) def _create_buckets_from_pretrained(self, pretrained): @@ -49,7 +54,7 @@ def _create_buckets_from_pretrained(self, pretrained): weights_with_pad = torch.tensor(np.vstack([pretrained, np.zeros((1, self.emb_size), dtype=np.float32)])) - self.buckets = nn.Embedding.from_pretrained(weights_with_pad, freeze=False, padding_idx=self.n_buckets) + self.buckets = nn.Embedding.from_pretrained(weights_with_pad, freeze=False, padding_idx=self.n_buckets, sparse=True) def _get_embedding_from_node_info(self, keys, node_info, masked=None): idxs = [] @@ -93,6 +98,42 @@ def forward(self, node_type=None, node_ids=None, 
train_embeddings=True, masked=N return self.get_embeddings(node_type, node_ids.tolist(), masked=masked) +class NodeIdEmbedder(NodeEmbedder): + def __init__(self, nodes=None, emb_size=None, dtype=None, n_buckets=500000, pretrained=None): + super(NodeIdEmbedder, self).__init__(nodes, emb_size, dtype, n_buckets, pretrained) + + def init(self, nodes, emb_size, dtype=None, n_buckets=500000, pretrained=None): + self.emb_size = emb_size + self.dtype = dtype + if dtype is None: + self.dtype = torch.float32 + self.n_buckets = n_buckets + + self.buckets = None + + embedding_field = "embeddable_name" + + nodes_with_embeddings = nodes.query("embeddable == True")[ + ['global_graph_id', 'typed_id', 'type', 'type_backup', embedding_field] + ] + + self.to_global_map = {} + for global_graph_id, typed_id, type_, type_backup, name in nodes_with_embeddings.values: + if type_ not in self.to_global_map: + self.to_global_map[type_] = {} + + self.to_global_map[type_][typed_id] = global_graph_id + + self._create_buckets() + + def get_embeddings(self, node_type=None, node_ids=None, masked=None): + assert node_ids is not None + if node_type is not None: + node_ids = list(map(lambda local_id: self.to_global_map[node_type][local_id], node_ids)) + + return self.buckets(torch.LongTensor(node_ids)) + + # class SimpleNodeEmbedder(nn.Module): # def __init__(self, dataset, emb_size, dtype=None, n_buckets=500000, pretrained=None): # super(SimpleNodeEmbedder, self).__init__() diff --git a/SourceCodeTools/models/graph/basis_gatconv.py b/SourceCodeTools/models/graph/basis_gatconv.py new file mode 100644 index 00000000..641f7ba9 --- /dev/null +++ b/SourceCodeTools/models/graph/basis_gatconv.py @@ -0,0 +1,237 @@ +from dgl import DGLError +from dgl.nn.functional import edge_softmax +from dgl.nn.pytorch import GATConv +from dgl.nn.pytorch.utils import Identity +from dgl.utils import expand_as_pair +import dgl.function as fn +from torch import nn, softmax +import torch as th +from torch.utils import 
checkpoint + + +class BasisGATConv(GATConv): + """ + Does not seem to improve memory requirements + """ + def __init__(self, + in_feats, + out_feats, + num_heads, + basis, + attn_basis, + basis_coef, + feat_drop=0., + attn_drop=0., + negative_slope=0.2, + residual=False, + activation=None, + allow_zero_in_degree=False, + bias=True, + use_checkpoint=False): + super(GATConv, self).__init__() + self._basis = basis + self._basis_coef = basis_coef + self._attn_basis = attn_basis + + self._num_heads = num_heads + self._in_src_feats, self._in_dst_feats = expand_as_pair(in_feats) + self._out_feats = out_feats + self._allow_zero_in_degree = allow_zero_in_degree + # if isinstance(in_feats, tuple): + # self.fc_src = nn.Linear( + # self._in_src_feats, out_feats * num_heads, bias=False) + # self.fc_dst = nn.Linear( + # self._in_dst_feats, out_feats * num_heads, bias=False) + # else: + # self.fc = nn.Linear( + # self._in_src_feats, out_feats * num_heads, bias=False) + # self.attn_l = nn.Parameter(th.FloatTensor(size=(1, num_heads, out_feats))) + # self.attn_r = nn.Parameter(th.FloatTensor(size=(1, num_heads, out_feats))) + self.feat_drop = nn.Dropout(feat_drop) + self.attn_drop = nn.Dropout(attn_drop) + self.leaky_relu = nn.LeakyReLU(negative_slope) + if bias: + self.bias = nn.Parameter(th.FloatTensor(size=(num_heads * out_feats,))) + else: + self.register_buffer('bias', None) + if residual: + if self._in_dst_feats != out_feats: + self.res_fc = nn.Linear( + self._in_dst_feats, num_heads * out_feats, bias=False) + else: + self.res_fc = Identity() + else: + self.register_buffer('res_fc', None) + self.reset_parameters() + self.activation = activation + + self.dummy_tensor = th.ones(1, dtype=th.float32, requires_grad=True) + self.use_checkpoint = use_checkpoint + + def reset_parameters(self): + """ + + Description + ----------- + Reinitialize learnable parameters. + + Note + ---- + The fc weights :math:`W^{(l)}` are initialized using Glorot uniform initialization. 
+ The attention weights are using xavier initialization method. + """ + gain = nn.init.calculate_gain('relu') + # if hasattr(self, 'fc'): + # nn.init.xavier_normal_(self.fc.weight, gain=gain) + # else: + # nn.init.xavier_normal_(self.fc_src.weight, gain=gain) + # nn.init.xavier_normal_(self.fc_dst.weight, gain=gain) + # nn.init.xavier_normal_(self.attn_l, gain=gain) + # nn.init.xavier_normal_(self.attn_r, gain=gain) + nn.init.constant_(self.bias, 0) + if isinstance(self.res_fc, nn.Linear): + nn.init.xavier_normal_(self.res_fc.weight, gain=gain) + + def set_allow_zero_in_degree(self, set_value): + r""" + + Description + ----------- + Set allow_zero_in_degree flag. + + Parameters + ---------- + set_value : bool + The value to be set to the flag. + """ + self._allow_zero_in_degree = set_value + + + def _forward(self, graph, feat, get_attention=False): + r""" + + Description + ----------- + Compute graph attention network layer. + + Parameters + ---------- + graph : DGLGraph + The graph. + feat : torch.Tensor or pair of torch.Tensor + If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})` where + :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes. + If a pair of torch.Tensor is given, the pair must contain two tensors of shape + :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`. + get_attention : bool, optional + Whether to return the attention values. Default to False. + + Returns + ------- + torch.Tensor + The output feature of shape :math:`(N, H, D_{out})` where :math:`H` + is the number of heads, and :math:`D_{out}` is size of output feature. + torch.Tensor, optional + The attention values of shape :math:`(E, H, 1)`, where :math:`E` is the number of + edges. This is returned only when :attr:`get_attention` is ``True``. + + Raises + ------ + DGLError + If there are 0-in-degree nodes in the input graph, it will raise DGLError + since no message will be passed to those nodes. This will cause invalid output. 
+ The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``. + """ + with graph.local_scope(): + if not self._allow_zero_in_degree: + if (graph.in_degrees() == 0).any(): + raise DGLError('There are 0-in-degree nodes in the graph, ' + 'output for those nodes will be invalid. ' + 'This is harmful for some applications, ' + 'causing silent performance regression. ' + 'Adding self-loop on the input graph by ' + 'calling `g = dgl.add_self_loop(g)` will resolve ' + 'the issue. Setting ``allow_zero_in_degree`` ' + 'to be `True` when constructing this module will ' + 'suppress the check and let the code run.') + + if isinstance(feat, tuple): + h_src = self.feat_drop(feat[0]) + h_dst = self.feat_drop(feat[1]) + basis_coef = softmax(self._basis_coef, dim=-1).reshape(-1, 1, 1) + # if not hasattr(self, 'fc_src'): + params_src = (self._basis[0] * basis_coef).sum(dim=0) + params_dst = (self._basis[1] * basis_coef).sum(dim=0) + feat_src = (params_src @ h_src.T).view(-1, self._num_heads, self._out_feats) + feat_dst = (params_dst @ h_dst.T).view(-1, self._num_heads, self._out_feats) + # # feat_src = self.fc(h_src).view(-1, self._num_heads, self._out_feats) + # # feat_dst = self.fc(h_dst).view(-1, self._num_heads, self._out_feats) + # else: + # params = self._basis * basis_coef + # feat_src = (params @ h_src.T).view(-1, self._num_heads, self._out_feats) + # feat_dst = (params @ h_dst.T).view(-1, self._num_heads, self._out_feats) + # # feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats) + # # feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats) + else: + h_src = h_dst = self.feat_drop(feat) + basis_coef = softmax(self._basis_coef, dim=-1).reshape(-1, 1, 1) + params = (self._basis * basis_coef).sum(dim=0) + feat_src = feat_dst = (params @ h_src.T).view(-1, self._num_heads, self._out_feats) + # feat_src = feat_dst = self.fc(h_src).view( + # -1, self._num_heads, self._out_feats) + if graph.is_block: + feat_dst = 
feat_src[:graph.number_of_dst_nodes()] + # NOTE: GAT paper uses "first concatenation then linear projection" + # to compute attention scores, while ours is "first projection then + # addition", the two approaches are mathematically equivalent: + # We decompose the weight vector a mentioned in the paper into + # [a_l || a_r], then + # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j + # Our implementation is much efficient because we do not need to + # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, + # addition could be optimized with DGL's built-in function u_add_v, + # which further speeds up computation and saves memory footprint. + attn_l_param = (self._attn_basis[0] * basis_coef).sum(dim=0) + attn_r_param = (self._attn_basis[1] * basis_coef).sum(dim=0) + el = (feat_src * attn_l_param).sum(dim=-1).unsqueeze(-1) + er = (feat_dst * attn_r_param).sum(dim=-1).unsqueeze(-1) + # el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1) + # er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1) + graph.srcdata.update({'ft': feat_src, 'el': el}) + graph.dstdata.update({'er': er}) + # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively. 
+ graph.apply_edges(fn.u_add_v('el', 'er', 'e')) + e = self.leaky_relu(graph.edata.pop('e')) + # compute softmax + graph.edata['a'] = self.attn_drop(edge_softmax(graph, e)) + # message passing + graph.update_all(fn.u_mul_e('ft', 'a', 'm'), + fn.sum('m', 'ft')) + rst = graph.dstdata['ft'] + # residual + if self.res_fc is not None: + resval = self.res_fc(h_dst).view(h_dst.shape[0], self._num_heads, self._out_feats) + rst = rst + resval + # bias + if self.bias is not None: + rst = rst + self.bias.view(1, self._num_heads, self._out_feats) + # activation + if self.activation: + rst = self.activation(rst) + + if get_attention: + return rst, graph.edata['a'] + else: + return rst + + def custom(self, graph, get_attention): + def custom_forward(*inputs): + feat0, feat1, dummy = inputs + return self._forward(graph, (feat0, feat1), get_attention=get_attention) + return custom_forward + + def forward(self, graph, feat, get_attention=False): + if self.use_checkpoint: + return checkpoint.checkpoint(self.custom(graph, get_attention), feat[0], feat[1], self.dummy_tensor).squeeze(1) + else: + return self._forward(graph, feat, get_attention=get_attention).squeeze(1) \ No newline at end of file diff --git a/SourceCodeTools/models/graph/ggnn.py b/SourceCodeTools/models/graph/ggnn.py index a0716fbf..f082f6ed 100644 --- a/SourceCodeTools/models/graph/ggnn.py +++ b/SourceCodeTools/models/graph/ggnn.py @@ -4,7 +4,7 @@ import dgl.function as fn from dgl.nn.pytorch.conv import GatedGraphConv -from SourceCodeTools.code.data.sourcetrail.Dataset import compact_property +from SourceCodeTools.code.data.dataset.Dataset import compact_property from SourceCodeTools.models.Embedder import Embedder diff --git a/SourceCodeTools/models/graph/rgcn_sampling.py b/SourceCodeTools/models/graph/rgcn_sampling.py index 42ed4536..ae71a788 100644 --- a/SourceCodeTools/models/graph/rgcn_sampling.py +++ b/SourceCodeTools/models/graph/rgcn_sampling.py @@ -7,6 +7,7 @@ import dgl.nn as dglnn # import tqdm from 
torch.utils import checkpoint +from tqdm import tqdm from SourceCodeTools.models.Embedder import Embedder @@ -38,9 +39,9 @@ def custom_forward(*inputs): def forward(self, graph, feat): if self.use_checkpoint: - return checkpoint.checkpoint(self.custom(graph), feat[0], feat[1], self.dummy_tensor) + return checkpoint.checkpoint(self.custom(graph), feat[0], feat[1], self.dummy_tensor) #.squeeze(1) else: - return super(CkptGATConv, self).forward(graph, feat) + return super(CkptGATConv, self).forward(graph, feat) #.squeeze(1) class RelGraphConvLayer(nn.Module): @@ -71,6 +72,7 @@ def __init__(self, in_feat, out_feat, rel_names, + ntype_names, num_bases, *, weight=True, @@ -82,18 +84,13 @@ def __init__(self, self.in_feat = in_feat self.out_feat = out_feat self.rel_names = rel_names + self.ntype_names = ntype_names self.num_bases = num_bases self.bias = bias self.activation = activation self.self_loop = self_loop self.use_gcn_checkpoint = use_gcn_checkpoint - # TODO - # think of possibility switching to GAT - # rel : dglnn.GATConv(in_feat, out_feat, num_heads=4) - # rel : dglnn.GraphConv(in_feat, out_feat, norm='right', weight=False, bias=False, allow_zero_in_degree=True) - self.create_conv(in_feat, out_feat, rel_names) - self.use_weight = weight self.use_basis = num_bases < len(self.rel_names) and weight if self.use_weight: @@ -104,17 +101,33 @@ def __init__(self, # nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu')) nn.init.xavier_normal_(self.weight) + # TODO + # think of possibility switching to GAT + # rel : dglnn.GATConv(in_feat, out_feat, num_heads=4) + # rel : dglnn.GraphConv(in_feat, out_feat, norm='right', weight=False, bias=False, allow_zero_in_degree=True) + self.create_conv(in_feat, out_feat, rel_names) + # bias if bias: - self.h_bias = nn.Parameter(th.Tensor(out_feat)) - nn.init.zeros_(self.h_bias) + self.bias_dict = nn.ParameterDict() + for ntype_name in self.ntype_names: + self.bias_dict[ntype_name] = nn.Parameter(torch.Tensor(1, 
out_feat)) + nn.init.normal_(self.bias_dict[ntype_name]) + # self.h_bias = nn.Parameter(th.Tensor(1, out_feat)) + # nn.init.normal_(self.h_bias) # weight for self loop if self.self_loop: + # if self.use_basis: + # self.loop_weight_basis = dglnn.WeightBasis((in_feat, out_feat), num_bases, len(self.ntype_names)) + # else: + # self.loop_weight = nn.Parameter(th.Tensor(len(self.ntype_names), in_feat, out_feat)) + # # nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu')) + # nn.init.xavier_normal_(self.loop_weight) self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat)) - # nn.init.xavier_uniform_(self.loop_weight, - # gain=nn.init.calculate_gain('relu')) - nn.init.xavier_normal_(self.loop_weight) + nn.init.xavier_uniform_(self.loop_weight, + gain=nn.init.calculate_gain('tanh')) + # # nn.init.xavier_normal_(self.loop_weight) self.dropout = nn.Dropout(dropout) @@ -145,6 +158,10 @@ def forward(self, g, inputs): weight = self.basis() if self.use_basis else self.weight wdict = {self.rel_names[i] : {'weight' : w.squeeze(0)} for i, w in enumerate(th.split(weight, 1, dim=0))} + # if self.self_loop: + # self_loop_weight = self.loop_weight_basis() if self.use_basis else self.loop_weight + # self_loop_wdict = {self.ntype_names[i]: w.squeeze(0) + # for i, w in enumerate(th.split(self_loop_weight, 1, dim=0))} else: wdict = {} @@ -160,9 +177,11 @@ def forward(self, g, inputs): def _apply(ntype, h): if self.self_loop: + # h = h + th.matmul(inputs_dst[ntype], self_loop_wdict[ntype]) h = h + th.matmul(inputs_dst[ntype], self.loop_weight) if self.bias: - h = h + self.h_bias + h = h + self.bias_dict[ntype] + # h = h + self.h_bias if self.activation: h = self.activation(h) return self.dropout(h) @@ -171,47 +190,47 @@ def _apply(ntype, h): # return {ntype: _apply(ntype, h) for ntype, h in hs.items()} return {ntype : _apply(ntype, h).mean(1) for ntype, h in hs.items()} -class RelGraphEmbed(nn.Module): - r"""Embedding layer for featureless heterograph.""" - def 
__init__(self, - g, - embed_size, - embed_name='embed', - activation=None, - dropout=0.0): - super(RelGraphEmbed, self).__init__() - self.g = g - self.embed_size = embed_size - self.embed_name = embed_name - # self.activation = activation - # self.dropout = nn.Dropout(dropout) - - # create weight embeddings for each node for each relation - self.embeds = nn.ParameterDict() - for ntype in g.ntypes: - embed = nn.Parameter(th.Tensor(g.number_of_nodes(ntype), self.embed_size)) - # TODO - # watch for activation in init - # nn.init.xavier_uniform_(embed, gain=nn.init.calculate_gain('relu')) - nn.init.xavier_normal_(embed) - self.embeds[ntype] = embed - - def forward(self, block=None): - """Forward computation - - Parameters - ---------- - block : DGLHeteroGraph, optional - If not specified, directly return the full graph with embeddings stored in - :attr:`embed_name`. Otherwise, extract and store the embeddings to the block - graph and return. - - Returns - ------- - DGLHeteroGraph - The block graph fed with embeddings. 
- """ - return self.embeds +# class RelGraphEmbed(nn.Module): +# r"""Embedding layer for featureless heterograph.""" +# def __init__(self, +# g, +# embed_size, +# embed_name='embed', +# activation=None, +# dropout=0.0): +# super(RelGraphEmbed, self).__init__() +# self.g = g +# self.embed_size = embed_size +# self.embed_name = embed_name +# # self.activation = activation +# # self.dropout = nn.Dropout(dropout) +# +# # create weight embeddings for each node for each relation +# self.embeds = nn.ParameterDict() +# for ntype in g.ntypes: +# embed = nn.Parameter(th.Tensor(g.number_of_nodes(ntype), self.embed_size)) +# # TODO +# # watch for activation in init +# # nn.init.xavier_uniform_(embed, gain=nn.init.calculate_gain('relu')) +# nn.init.xavier_normal_(embed) +# self.embeds[ntype] = embed +# +# def forward(self, block=None): +# """Forward computation +# +# Parameters +# ---------- +# block : DGLHeteroGraph, optional +# If not specified, directly return the full graph with embeddings stored in +# :attr:`embed_name`. Otherwise, extract and store the embeddings to the block +# graph and return. +# +# Returns +# ------- +# DGLHeteroGraph +# The block graph fed with embeddings. 
+# """ +# return self.embeds class RGCNSampling(nn.Module): def __init__(self, @@ -231,6 +250,8 @@ def __init__(self, self.rel_names = list(set(g.etypes)) self.rel_names.sort() + self.ntype_names = list(set(g.etypes)) + self.ntype_names.sort() if num_bases < 0 or num_bases > len(self.rel_names): self.num_bases = len(self.rel_names) else: @@ -244,14 +265,14 @@ def __init__(self, self.layer_norm = nn.ModuleList() # i2h self.layers.append(RelGraphConvLayer( - self.h_dim, self.h_dim, self.rel_names, + self.h_dim, self.h_dim, self.rel_names, self.ntype_names, self.num_bases, activation=self.activation, self_loop=self.use_self_loop, dropout=self.dropout, weight=False, use_gcn_checkpoint=use_gcn_checkpoint)) self.layer_norm.append(nn.LayerNorm([self.h_dim])) # h2h for i in range(self.num_hidden_layers): self.layers.append(RelGraphConvLayer( - self.h_dim, self.h_dim, self.rel_names, + self.h_dim, self.h_dim, self.rel_names, self.ntype_names, self.num_bases, activation=self.activation, self_loop=self.use_self_loop, dropout=self.dropout, weight=False, use_gcn_checkpoint=use_gcn_checkpoint)) # changed weight for GATConv self.layer_norm.append(nn.LayerNorm([self.h_dim])) @@ -260,7 +281,7 @@ def __init__(self, # weight=False # h2o self.layers.append(RelGraphConvLayer( - self.h_dim, self.out_dim, self.rel_names, + self.h_dim, self.out_dim, self.rel_names, self.ntype_names, self.num_bases, activation=None, self_loop=self.use_self_loop, weight=False, use_gcn_checkpoint=use_gcn_checkpoint)) # changed weight for GATConv self.layer_norm.append(nn.LayerNorm([self.out_dim])) @@ -314,9 +335,10 @@ def forward(self, h, blocks=None, # all_layers.append(h) # added this as an experimental feature for intermediate supervision # else: # minibatch training + h0 = h for layer, norm, block in zip(self.layers, self.layer_norm, blocks): # h = checkpoint.checkpoint(self.custom(layer), block, h) - h = layer(block, h) + h = layer(block, h, h0) h = self.normalize(h, norm) all_layers.append(h) # added 
this as an experimental feature for intermediate supervision @@ -332,6 +354,7 @@ def inference(self, batch_size, device, num_workers, x=None): For node classification, the model is trained to predict on only one node type's label. Therefore, only that type's final representation is meaningful. """ + h0 = x with th.set_grad_enabled(False): @@ -355,7 +378,7 @@ def inference(self, batch_size, device, num_workers, x=None): drop_last=False, num_workers=num_workers) - for input_nodes, output_nodes, blocks in dataloader:#tqdm.tqdm(dataloader): + for input_nodes, output_nodes, blocks in tqdm(dataloader, desc=f"Layer {l}"): block = blocks[0].to(device) if not isinstance(input_nodes, dict): @@ -363,8 +386,9 @@ def inference(self, batch_size, device, num_workers, x=None): input_nodes = {key: input_nodes} output_nodes = {key: output_nodes} + _h0 = {k: h0[k][input_nodes[k]].to(device) for k in input_nodes.keys()} h = {k: x[k][input_nodes[k]].to(device) for k in input_nodes.keys()} - h = layer(block, h) + h = layer(block, h, _h0) h = self.normalize(h, norm) for k in h.keys(): diff --git a/SourceCodeTools/models/graph/rggan.py b/SourceCodeTools/models/graph/rggan.py index ec5fc160..54fb2ce8 100644 --- a/SourceCodeTools/models/graph/rggan.py +++ b/SourceCodeTools/models/graph/rggan.py @@ -1,5 +1,7 @@ +import torch from torch.utils import checkpoint +# from SourceCodeTools.models.graph.basis_gatconv import BasisGATConv from SourceCodeTools.models.graph.rgcn_sampling import RGCNSampling, RelGraphConvLayer, CkptGATConv import torch as th @@ -37,7 +39,7 @@ def do_stuff(self, query, key, value, dummy=None): def forward(self, list_inputs, dsttype): # pylint: disable=unused-argument if len(list_inputs) == 1: return list_inputs[0] - key = value = th.stack(list_inputs).squeeze(dim=1) + key = value = th.stack(list_inputs)#.squeeze(dim=1) query = self.query_emb(th.LongTensor([token_hasher(dsttype, self.num_query_buckets)]).to(self.att.in_proj_bias.device)).unsqueeze(0).repeat(1, 
key.shape[1], 1) # query = self.query_emb[token_hasher(dsttype, self.num_query_buckets)].unsqueeze(0).repeat(1, key.shape[1], 1) if self.use_checkpoint: @@ -45,14 +47,42 @@ def forward(self, list_inputs, dsttype): # pylint: disable=unused-argument else: att_out, att_w = self.do_stuff(query, key, value) # att_out, att_w = self.att(query, key, value) + # return att_out.mean(0)#.unsqueeze(1) return att_out.mean(0).unsqueeze(1) +from torch.nn import init +class NZBiasGraphConv(dglnn.GraphConv): + def __init__(self, *args, **kwargs): + super(NZBiasGraphConv, self).__init__(*args, **kwargs) + self.use_checkpoint = True + self.dummy_tensor = th.ones(1, dtype=th.float32, requires_grad=True) + + def reset_parameters(self): + if self.weight is not None: + init.xavier_uniform_(self.weight) + if self.bias is not None: + init.normal_(self.bias) + + def custom(self, graph): + def custom_forward(*inputs): + feat0, feat1, weight, _ = inputs + return super(NZBiasGraphConv, self).forward(graph, (feat0, feat1), weight=weight) + return custom_forward + + def forward(self, graph, feat, weight=None): + if self.use_checkpoint: + return checkpoint.checkpoint(self.custom(graph), feat[0], feat[1], weight, self.dummy_tensor) #.squeeze(1) + else: + return super(NZBiasGraphConv, self).forward(graph, feat, weight=weight) #.squeeze(1) + + class RGANLayer(RelGraphConvLayer): def __init__(self, in_feat, out_feat, rel_names, + ntype_names, num_bases, *, weight=True, @@ -62,17 +92,40 @@ def __init__(self, dropout=0.0, use_gcn_checkpoint=False, use_att_checkpoint=False): self.use_att_checkpoint = use_att_checkpoint super(RGANLayer, self).__init__( - in_feat, out_feat, rel_names, num_bases, + in_feat, out_feat, rel_names, ntype_names, num_bases, weight=weight, bias=bias, activation=activation, self_loop=self_loop, dropout=dropout, use_gcn_checkpoint=use_gcn_checkpoint ) def create_conv(self, in_feat, out_feat, rel_names): - self.attentive_aggregator = AttentiveAggregator(out_feat, 
use_checkpoint=self.use_att_checkpoint) + # self.attentive_aggregator = AttentiveAggregator(out_feat, use_checkpoint=self.use_att_checkpoint) + # + # num_heads = 1 + # basis_size = 10 + # basis = torch.nn.Parameter(torch.Tensor(2, basis_size, in_feat, out_feat * num_heads)) + # attn_basis = torch.nn.Parameter(torch.Tensor(2, basis_size, num_heads, out_feat)) + # basis_coef = nn.ParameterDict({rel: torch.nn.Parameter(torch.rand(basis_size,)) for rel in rel_names}) + # + # torch.nn.init.xavier_normal_(basis, gain=1.) + # torch.nn.init.xavier_normal_(attn_basis, gain=1.) + self.conv = dglnn.HeteroGraphConv({ - rel: CkptGATConv((in_feat, in_feat), out_feat, num_heads=1, use_checkpoint=self.use_gcn_checkpoint) + rel: NZBiasGraphConv( + in_feat, out_feat, norm='right', weight=False, bias=True, allow_zero_in_degree=True,# activation=self.activation + ) + # rel: BasisGATConv( + # (in_feat, in_feat), out_feat, num_heads=num_heads, + # basis=basis, + # attn_basis=attn_basis, + # basis_coef=basis_coef[rel], + # use_checkpoint=self.use_gcn_checkpoint + # ) for rel in rel_names - }, aggregate=self.attentive_aggregator) + }, aggregate="mean") #self.attentive_aggregator) + # self.conv = dglnn.HeteroGraphConv({ + # rel: CkptGATConv((in_feat, in_feat), out_feat, num_heads=num_heads, use_checkpoint=self.use_gcn_checkpoint) + # for rel in rel_names + # }, aggregate="mean") #self.attentive_aggregator) class RGAN(RGCNSampling): @@ -92,6 +145,8 @@ def __init__(self, self.rel_names = list(set(g.etypes)) self.rel_names.sort() + self.ntype_names = list(set(g.ntypes)) + self.ntype_names.sort() if num_bases < 0 or num_bases > len(self.rel_names): self.num_bases = len(self.rel_names) else: @@ -103,14 +158,14 @@ def __init__(self, self.layers = nn.ModuleList() # i2h self.layers.append(RGANLayer( - self.h_dim, self.h_dim, self.rel_names, + self.h_dim, self.h_dim, self.rel_names, self.ntype_names, self.num_bases, activation=self.activation, self_loop=self.use_self_loop, dropout=self.dropout, 
weight=False, use_gcn_checkpoint=use_gcn_checkpoint, use_att_checkpoint=use_att_checkpoint)) # h2h for i in range(self.num_hidden_layers): self.layers.append(RGANLayer( - self.h_dim, self.h_dim, self.rel_names, + self.h_dim, self.h_dim, self.rel_names, self.ntype_names, self.num_bases, activation=self.activation, self_loop=self.use_self_loop, dropout=self.dropout, weight=False, use_gcn_checkpoint=use_gcn_checkpoint, use_att_checkpoint=use_att_checkpoint)) # changed weight for GATConv @@ -119,7 +174,7 @@ def __init__(self, # weight=False # h2o self.layers.append(RGANLayer( - self.h_dim, self.out_dim, self.rel_names, + self.h_dim, self.out_dim, self.rel_names, self.ntype_names, self.num_bases, activation=None, self_loop=self.use_self_loop, weight=False, use_gcn_checkpoint=use_gcn_checkpoint, use_att_checkpoint=use_att_checkpoint)) # changed weight for GATConv @@ -147,6 +202,7 @@ def __init__(self, dim, use_checkpoint=False): self.dummy_tensor = th.ones(1, dtype=th.float32, requires_grad=True) def do_stuff(self, x, h, dummy_tensor=None): + # x = x.unsqueeze(1) r = self.act_r(self.gru_rx(x) + self.gru_rh(h)) z = self.act_z(self.gru_zx(x) + self.gru_zh(h)) n = self.act_n(self.gru_nx(x) + self.gru_nh(r * h)) @@ -165,6 +221,7 @@ def __init__(self, in_feat, out_feat, rel_names, + ntype_names, num_bases, *, weight=True, @@ -173,14 +230,15 @@ def __init__(self, self_loop=False, dropout=0.0, use_gcn_checkpoint=False, use_att_checkpoint=False, use_gru_checkpoint=False): super(RGGANLayer, self).__init__( - in_feat, out_feat, rel_names, num_bases, weight=weight, bias=bias, activation=activation, + in_feat, out_feat, rel_names, ntype_names, num_bases, weight=weight, bias=bias, activation=activation, self_loop=self_loop, dropout=dropout, use_gcn_checkpoint=use_gcn_checkpoint, use_att_checkpoint=use_att_checkpoint ) + # self.mix_weights = nn.Parameter(torch.randn(3).reshape((3,1,1))) - self.gru = OneStepGRU(out_feat, use_checkpoint=use_gru_checkpoint) + # self.gru = 
OneStepGRU(out_feat, use_checkpoint=use_gru_checkpoint) - def forward(self, g, inputs): + def forward(self, g, inputs, h0): """Forward computation Parameters @@ -200,6 +258,10 @@ def forward(self, g, inputs): weight = self.basis() if self.use_basis else self.weight wdict = {self.rel_names[i] : {'weight' : w.squeeze(0)} for i, w in enumerate(th.split(weight, 1, dim=0))} + # if self.self_loop: + # self_loop_weight = self.loop_weight_basis() if self.use_basis else self.loop_weight + # self_loop_wdict = {self.ntype_names[i]: w.squeeze(0) + # for i, w in enumerate(th.split(self_loop_weight, 1, dim=0))} else: wdict = {} @@ -215,9 +277,15 @@ def forward(self, g, inputs): def _apply(ntype, h): if self.self_loop: + # h = h + th.matmul(inputs_dst[ntype], self_loop_wdict[ntype]) h = h + th.matmul(inputs_dst[ntype], self.loop_weight) + # mix = nn.functional.softmax(self.mix_weights, dim=0) + # h = h - inputs_dst[ntype] + th.matmul(inputs_dst[ntype], self.loop_weight) #+ h0[ntype][:h.size(0), :] + # h = h + th.matmul(inputs_dst[ntype], self.loop_weight) + h0[ntype][:h.size(0), :] + # h = (torch.stack([h, th.matmul(inputs_dst[ntype], self.loop_weight), h0[ntype][:h.size(0), :]], dim=0) * mix).sum(0) if self.bias: - h = h + self.h_bias + h = h + self.bias_dict[ntype] + # h = h + self.h_bias if self.activation: h = self.activation(h) return self.dropout(h) @@ -226,10 +294,10 @@ def _apply(ntype, h): # TODO # think of possibility switching to GAT - # return {ntype: _apply(ntype, h) for ntype, h in hs.items()} - h_gru_input = {ntype : _apply(ntype, h) for ntype, h in hs.items()} - - return {dsttype: self.gru(h_dst, inputs_dst[dsttype].unsqueeze(1)).squeeze(dim=1) for dsttype, h_dst in h_gru_input.items()} + return {ntype: _apply(ntype, h) for ntype, h in hs.items()} + # h_gru_input = {ntype : _apply(ntype, h) for ntype, h in hs.items()} + # + # return {dsttype: self.gru(h_dst, inputs_dst[dsttype].unsqueeze(1)).squeeze(dim=1) for dsttype, h_dst in h_gru_input.items()} class 
RGGAN(RGAN): """A gated recurrent unit (GRU) cell @@ -246,9 +314,9 @@ class RGGAN(RGAN): where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.""" def __init__(self, g, - h_dim, num_classes, + h_dim, node_emb_size, num_bases, - num_steps=1, + n_layers=1, dropout=0, use_self_loop=False, activation=F.relu, @@ -256,13 +324,15 @@ def __init__(self, super(RGCNSampling, self).__init__() self.g = g self.h_dim = h_dim - self.out_dim = num_classes + self.out_dim = node_emb_size self.activation = activation - assert h_dim == num_classes, f"Parameter h_dim and num_classes should be equal in {self.__class__.__name__}" + assert h_dim == node_emb_size, f"Parameter h_dim and num_classes should be equal in {self.__class__.__name__}" self.rel_names = list(set(g.etypes)) + self.ntype_names = list(set(g.ntypes)) self.rel_names.sort() + self.ntype_names.sort() if num_bases < 0 or num_bases > len(self.rel_names): self.num_bases = len(self.rel_names) else: @@ -273,16 +343,16 @@ def __init__(self, # i2h self.layer = RGGANLayer( - self.h_dim, self.h_dim, self.rel_names, + self.h_dim, self.h_dim, self.rel_names, self.ntype_names, self.num_bases, activation=self.activation, self_loop=self.use_self_loop, - dropout=self.dropout, weight=False, use_gcn_checkpoint=use_gcn_checkpoint, + dropout=self.dropout, weight=True, use_gcn_checkpoint=use_gcn_checkpoint, # : ) use_att_checkpoint=use_att_checkpoint, use_gru_checkpoint=use_gru_checkpoint ) # TODO # think of possibility switching to GAT # weight=False - self.emb_size = num_classes - self.num_layers = num_steps - self.layers = [self.layer] * num_steps + self.emb_size = node_emb_size + self.num_layers = n_layers + self.layers = [self.layer] * n_layers self.layer_norm = nn.ModuleList([nn.LayerNorm([self.h_dim]) for _ in range(self.num_layers)]) diff --git a/SourceCodeTools/models/graph/train/Scorer.py b/SourceCodeTools/models/graph/train/Scorer.py index 11fb736f..eba46e3a 100644 --- 
a/SourceCodeTools/models/graph/train/Scorer.py +++ b/SourceCodeTools/models/graph/train/Scorer.py @@ -1,47 +1,161 @@ import random -from collections import Iterable +import time +from collections import Iterable, defaultdict +from typing import Dict, List import torch import numpy as np -from sklearn.metrics import ndcg_score +from sklearn.metrics import ndcg_score, top_k_accuracy_score +from sklearn.neighbors import NearestNeighbors from sklearn.neighbors._ball_tree import BallTree from sklearn.preprocessing import normalize class FaissIndex: - def __init__(self, X, *args, **kwargs): + def __init__(self, X, method="inner_prod", *args, **kwargs): import faiss - self.index = faiss.IndexFlatL2(X.shape[1]) + self.method = method + if method == "inner_prod": + self.index = faiss.IndexFlatIP(X.shape[1]) + elif method == "l2": + self.index = faiss.IndexFlatL2(X.shape[1]) + else: + raise NotImplementedError() self.index.add(X.astype(np.float32)) - def query(self, X, k): + def query_inner_prod(self, X, k): + X = normalize(X, axis=1) return self.index.search(X.astype(np.float32), k=k) + def query_l2(self, X, k): + return self.index.search(X.astype(np.float32), k=k) + + def query(self, X, k): + if self.method == "inner_prod": + return self.query_inner_prod(X, k) + elif self.method == "l2": + return self.query_l2(X, k) + else: + raise NotImplementedError() + + +class Brute: + def __init__(self, X, method="inner_prod", device="cpu", *args, **kwargs): + self.vectors = X + self.method = method + self.device = device + + def query_inner_prod(self, X, k): + X = torch.Tensor(normalize(X, axis=1)).to(self.device) + vectors = torch.Tensor(self.vectors).to(self.device) + score = (vectors @ X.T).reshape(-1,).to("cpu").numpy() + ind = np.flip(np.argsort(score))[:k] + return score[ind][:k], ind + + def query_l2(self, X, k): + vectors = torch.Tensor(self.vectors).to(self.device) + X = torch.Tensor(X).to(self.device) + score = torch.norm(vectors - X, dim=-1).to("cpu").numpy() + # score = 
np.linalg.norm(vectors - X).reshape(-1,) + ind = np.argsort(score)[:k] + return score[ind][:k], ind + + def query(self, X, k): + assert X.shape[0] == 1 + if self.method == "inner_prod": + dist, ind = self.query_inner_prod(X, k) + elif self.method == "l2": + dist, ind = self.query_l2(X, k) + else: + raise NotImplementedError() + + return dist, ind#.reshape((-1,1)) + class Scorer: """ - Implements sampler for hard triplet loss. This sampler is useful when the loss is based on the neighbourhood + Implements sampler for triplet loss. This sampler is useful when the loss is based on the neighbourhood similarity. It becomes less useful when the decision is made by neural network because it does not need to mode points to learn how to make correct decisions. """ - def __init__(self, num_embs, emb_size, src2dst, neighbours_to_sample=5, index_backend="sklearn"): + def __init__( + self, num_embs, emb_size, src2dst: Dict[int, List[int]], neighbours_to_sample=5, index_backend="brute", + method = "inner_prod", device="cpu", ns_groups=None + ): + """ + Creates an embedding table, the embeddings in this table are updated once during an epoch. Embeddings from this + table are used for nearest neighbour queries during negative sampling. We avoid keeping track of all possible + embeddings by knowing that only part of embeddings are eligible as DST. 
+ :param num_embs: number of unique DST + :param emb_size: embedding dimensionality + :param src2dst: Mapping from SRC to all DST, need this to find the hardest negative example for all DST at once + :param neighbours_to_sample: default number of neighbours + :param index_backend: Choose between sklearn and faiss + """ self.scorer_num_emb = num_embs self.scorer_emb_size = emb_size - self.scorer_src2dst = src2dst + self.scorer_src2dst = src2dst # mapping from src to all possible dst self.scorer_index_backend = index_backend + self.scorer_method = method + self.scorer_device = device - self.scorer_all_emb = np.ones((num_embs, emb_size)) + self.scorer_all_emb = normalize(np.ones((num_embs, emb_size)), axis=1) # unique dst embedding table self.scorer_all_keys = self.get_cand_to_score_against(None) self.scorer_key_order = dict(zip(self.scorer_all_keys, range(len(self.scorer_all_keys)))) self.scorer_index = None self.neighbours_to_sample = min(neighbours_to_sample, self.scorer_num_emb) + self.prepare_ns_groups(ns_groups) + + def prepare_ns_groups(self, ns_groups): + if ns_groups is None: + return + + self.scorer_node2ns_group = {} + self.scorer_ns_group2nodes = defaultdict(list) + + unique_dst = set() + for dsts in self.scorer_src2dst.values(): + for dst in dsts: + if isinstance(dst, tuple): + unique_dst.add(dst[0]) + else: + unique_dst.add(dst) + + for id, mentioned_in in ns_groups.values: + id_ = self.id2nodeid[id] + mentioned_in_ = self.id2nodeid[mentioned_in] + self.scorer_node2ns_group[id_] = mentioned_in_ + if id_ in unique_dst: + self.scorer_ns_group2nodes[mentioned_in_].append(id_) + + def sample_negative_from_groups(self, key_groups, k): + possible_targets = [] + for key_group in key_groups: + any_key = key_group[0] + possible_negative = self.scorer_ns_group2nodes[self.scorer_node2ns_group[any_key]] + possible_negative_ = list(set(possible_negative) - set(key_group)) + if len(possible_negative_) < k: + # backup strategy + possible_negative_.extend( + 
random.choices(list(set(self.scorer_all_keys) - set(key_group)), k=k - len(possible_negative_)) + ) + possible_targets.append(possible_negative_) + return possible_targets - def prepare_index(self): + + def prepare_index(self, override_strategy=None): + if self.scorer_method == "nn": + self.scorer_index = None + return if self.scorer_index_backend == "sklearn": - self.scorer_index = BallTree(self.scorer_all_emb, leaf_size=1) + self.scorer_index = NearestNeighbors() + self.scorer_index.fit(self.scorer_all_emb) + # self.scorer_index = BallTree(self.scorer_all_emb, leaf_size=1) # self.scorer_index = BallTree(normalize(self.scorer_all_emb, axis=1), leaf_size=1) elif self.scorer_index_backend == "faiss": - self.scorer_index = FaissIndex(self.scorer_all_emb) + self.scorer_index = FaissIndex(self.scorer_all_emb, method=self.scorer_method) + elif self.scorer_index_backend == "brute": + self.scorer_index = Brute(self.scorer_all_emb, method=self.scorer_method, device=self.scorer_device) else: raise ValueError(f"Unsupported backend: {self.scorer_index_backend}. 
Supported backends are: sklearn|faiss") @@ -51,8 +165,18 @@ def sample_closest_negative(self, ids, k=None): assert ids is not None seed_pool = [] - [seed_pool.append(self.scorer_src2dst[id]) for id in ids] - nested_negative = self.get_closest_to_keys(seed_pool, k=k+1) + for id in ids: + pool = self.scorer_src2dst[id] + if len(pool) > 0 and isinstance(pool[0], tuple): + pool = [x[0] for x in pool] + seed_pool.append(pool) + if id in self.scorer_key_order: + seed_pool[-1] = seed_pool[-1] + [id] # make sure that original list is not changed + # [seed_pool.append(self.scorer_src2dst[id]) for id in ids] + if hasattr(self, "scorer_ns_group2nodes"): + nested_negative = self.sample_negative_from_groups(seed_pool, k=k+1) + else: + nested_negative = self.get_closest_to_keys(seed_pool, k=k+1) negative = [] for neg in nested_negative: @@ -68,10 +192,11 @@ def get_closest_to_keys(self, key_groups, k=None): self.scorer_all_emb[self.scorer_key_order[key]].reshape(1, -1), k=k ) closest_keys.extend(self.scorer_all_keys[c] for c in closest.ravel()) + # ensure that negative samples do not come from positive edges closest_keys_ = list(set(closest_keys) - set(key_group)) if len(closest_keys_) == 0: # backup strategy - closest_keys_ = random.choices(list(set(self.scorer_all_keys) - set(key_group)), k=k-1) + closest_keys_ = random.choices(list(set(self.scorer_all_keys) - set(key_group)), k=k) possible_targets.append(closest_keys_) # possible_targets.extend(self.scorer_all_keys[c] for c in closest.ravel()) return possible_targets @@ -80,75 +205,263 @@ def get_closest_to_keys(self, key_groups, k=None): def set_embed(self, ids, embs): ids = np.array(list(map(self.scorer_key_order.get, ids.tolist()))) - self.scorer_all_emb[ids, :] = embs# normalize(embs, axis=1) + self.scorer_all_emb[ids, :] = normalize(embs, axis=1) if self.scorer_method == "inner_prod" else embs # for ind, id in enumerate(ids): # self.all_embs[self.key_order[id], :] = embs[ind, :] def score_candidates_cosine(self, 
to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, at=None): - score_matr = (to_score_embs @ embs_to_score_against.t()) / \ - to_score_embs.norm(p=2, dim=1, keepdim=True) / \ - embs_to_score_against.norm(p=2, dim=1, keepdim=True).t() - y_pred = score_matr.tolist() + to_score_embs = to_score_embs / to_score_embs.norm(p=2, dim=1, keepdim=True) + embs_to_score_against = embs_to_score_against / embs_to_score_against.norm(p=2, dim=1, keepdim=True) + + score_matr = (to_score_embs @ embs_to_score_against.t()) + score_matr = (score_matr + 1.) / 2. + # score_matr = score_matr - self.margin + # score_matr[score_matr < 0.] = 0. + y_pred = score_matr.cpu().tolist() return y_pred - def score_candidates_lp(self, to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, link_predictor, at=None): + def set_margin(self, margin): + self.margin = margin + + def score_candidates_l2(self, to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, at=None): y_pred = [] for i in range(len(to_score_ids)): - input_embs = to_score_embs[i, :].repeat((embs_to_score_against.shape[0], 1)) - # predictor_input = torch.cat([input_embs, all_emb], dim=1) - y_pred.append(torch.nn.functional.softmax(link_predictor(input_embs, embs_to_score_against), dim=1)[:, 1].tolist()) # 0 - negative, 1 - positive + input_embs = to_score_embs[i, :].reshape(1, -1) + score_matr = torch.norm(embs_to_score_against - input_embs, dim=-1) + score_matr = 1. / (1. + score_matr) + # score_matr = score_matr + self.margin + # score_matr[score_matr < 0.] = 0 + y_pred.append(score_matr.cpu().tolist()) + + # embs_to_score_against = embs_to_score_against.unsqueeze(0) + # to_score_embs = to_score_embs.unsqueeze(1) + # + # score_matr = -torch.norm(embs_to_score_against - to_score_embs, dim=-1) + # score_matr = score_matr + self.margin + # score_matr[score_matr < 0.] 
= 0 + # y_pred = score_matr.cpu().tolist() return y_pred + def score_candidates_lp( + self, to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, link_predictor, at=None, + with_types=None + ): + + if with_types is None: + y_pred = [] + for i in range(len(to_score_ids)): + input_embs = to_score_embs[i, :].repeat((embs_to_score_against.shape[0], 1)) + # predictor_input = torch.cat([input_embs, all_emb], dim=1) + y_pred.append( + torch.nn.functional.softmax(link_predictor(input_embs, embs_to_score_against), dim=1)[:, 1].tolist() + ) # 0 - negative, 1 - positive + + return y_pred + + else: + y_pred = [] + for i in range(len(to_score_ids)): + y_pred.append(dict()) + for type in with_types[i]: + input_embs = to_score_embs[i, :].unsqueeze(0) + # predictor_input = torch.cat([input_embs, all_emb], dim=1) + + labels = torch.LongTensor([type]).to(link_predictor.proj_matr.weight.device) + weights = link_predictor.proj_matr(labels).reshape((-1, link_predictor.rel_dim, link_predictor.input_dim)) + rels = link_predictor.rel_emb(labels) + m_a = (weights * input_embs.unsqueeze(1)).sum(-1) + m_s = (weights * embs_to_score_against.unsqueeze(1)).sum(-1) + + transl = m_a + rels + sim = torch.norm(transl - m_s, dim=-1) + sim = 1./ (1. + sim) + y_pred[-1][type] = sim.cpu().tolist() + + return y_pred + + def get_gt_candidates(self, ids): candidates = [set(list(self.scorer_src2dst[id])) for id in ids] return candidates def get_cand_to_score_against(self, ids): + """ + Generate sorted list of all possible DST. 
These will be used as possible targets during NDCG calculation + :param ids: + :return: + """ all_keys = set() - [all_keys.update(self.scorer_src2dst[key]) for key in self.scorer_src2dst] + for key in self.scorer_src2dst: + cand = self.scorer_src2dst[key] + for c in cand: + if isinstance(c, tuple): # happens with graphlinkclassifier objectives + all_keys.add(c[0]) + else: + all_keys.add(c) + + # [all_keys.update(self.scorer_src2dst[key]) for key in self.scorer_src2dst] return sorted(list(all_keys)) # list(self.elem2id[a] for a in all_keys) def get_embeddings_for_scoring(self, device, **kwargs): + """ + Get all embeddings as a tensor + :param device: + :param kwargs: + :return: + """ return torch.Tensor(self.scorer_all_emb).to(device) def get_keys_for_scoring(self): return self.scorer_all_keys + def hits_at_k(self, y_true, y_pred, k): + correct = y_true + predicted = y_pred + result = [] + for y_true, y_pred in zip(correct, predicted): + ind_true = set([ind for ind, y_t in enumerate(y_true) if y_t == 1.]) + ind_pred = set(list(np.flip(np.argsort(y_pred))[:k])) + result.append(len(ind_pred.intersection(ind_true)) / min(len(ind_true), k)) + + return sum(result) / len(result) + + def mean_rank(self, y_true, y_pred): + true_ranks = [] + reciprocal_ranks = [] + + correct = y_true + predicted = y_pred + for y_true, y_pred in zip(correct, predicted): + ranks = sorted(zip(y_true, y_pred), key=lambda x: x[1], reverse=True) + for ind, (true, pred) in enumerate(ranks): + if true > 0.: + true_ranks.append(ind + 1) + reciprocal_ranks.append(1 / (ind + 1)) + break # should consider only first result https://en.wikipedia.org/wiki/Mean_reciprocal_rank + + return sum(true_ranks) / len(true_ranks), sum(reciprocal_ranks) / len(reciprocal_ranks) + + def mean_average_precision(self, y_true, y_pred): + correct = y_true + predicted = y_pred + map = 0. 
+ for y_true, y_pred in zip(correct, predicted): + ranks = sorted(zip(y_true, y_pred), key=lambda x: x[1], reverse=True) + found_relevant = 0 + avep = 0. + for ind, (true, pred) in enumerate(ranks): + if true > 0.: + found_relevant += 1 + avep += found_relevant / (ind + 1) # precision@k + avep /= found_relevant + + map += avep + + map /= len(correct) + + return map + + def get_y_true_from_candidate_list(self, candidates, keys_to_score_against): + y_true = [[1. if key in cand else 0. for key in keys_to_score_against] for cand in candidates] + return y_true + + def get_y_true_from_candidates(self, candidates, keys_to_score_against): + + has_types = isinstance(list(candidates[0])[0], tuple) + + if not has_types: + return self.get_y_true_from_candidate_list(candidates, keys_to_score_against) + + candidate_dicts = [] + for cand in candidates: + candidate_dicts.append(dict()) + for ent, type in cand: + if type not in candidate_dicts[-1]: + candidate_dicts[-1][type] = [] + candidate_dicts[-1][type].append(ent) + + y_true = [] + for cand in candidate_dicts: + y_true.append(dict()) + for type, ents in cand.items(): + y_true[-1][type] = [1. if key in ents else 0. for key in keys_to_score_against] + assert sum(y_true[-1][type]) > 0. + + return y_true + + def flatten_pred(self, y): + + flattened = [] + for x in y: + for key, scores in x.items(): + flattened.append(scores) + + return flattened + def score_candidates(self, to_score_ids, to_score_embs, link_predictor=None, at=None, type=None, device="cpu"): if at is None: at = [1, 3, 5, 10] + start = time.time() + to_score_ids = to_score_ids.tolist() - candidates = self.get_gt_candidates(to_score_ids) + candidates = self.get_gt_candidates(to_score_ids) # positive candidates # keys_to_score_against = self.get_cand_to_score_against(to_score_ids) keys_to_score_against = self.get_keys_for_scoring() - y_true = [[1. if key in cand else 0. 
for key in keys_to_score_against] for cand in candidates] + y_true = self.get_y_true_from_candidates(candidates, keys_to_score_against) embs_to_score_against = self.get_embeddings_for_scoring(device=to_score_embs.device) if type == "nn": + has_types = isinstance(y_true[0], dict) + y_pred = self.score_candidates_lp( to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, - link_predictor, at=at + link_predictor, at=at, with_types=y_true if has_types else None ) elif type == "inner_prod": y_pred = self.score_candidates_cosine( to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, at=at ) + elif type == "l2": + y_pred = self.score_candidates_l2( + to_score_ids, to_score_embs, keys_to_score_against, embs_to_score_against, at=at + ) else: raise ValueError(f"`type` can be either `nn` or `inner_prod` but `{type}` given") + has_types = isinstance(y_pred[0], dict) + + if has_types: + y_true = self.flatten_pred(y_true) + y_pred = self.flatten_pred(y_pred) + + scores = {} + # y_true_onehot = np.array(y_true) + # labels=list(range(y_true_onehot.shape[1])) + if isinstance(at, Iterable): - scores = {f"ndcg@{k}": ndcg_score(y_true, y_pred, k=k) for k in at} + scores.update({f"hits@{k}": self.hits_at_k(y_true, y_pred, k=k) for k in at}) + scores.update({f"ndcg@{k}": ndcg_score(y_true, y_pred, k=k) for k in at}) + # scores = {f"ndcg@{k}": ndcg_score(y_true, y_pred, k=k) for k in at} else: - scores = {f"ndcg@{at}": ndcg_score(y_true, y_pred, k=at)} - return scores \ No newline at end of file + scores.update({f"hits@{at}": self.hits_at_k(y_true, y_pred, k=at)}) + scores.update({f"ndcg@{at}": ndcg_score(y_true, y_pred, k=at)}) + # scores = {f"ndcg@{at}": ndcg_score(y_true, y_pred, k=at)} + + mr, mrr = self.mean_rank(y_true, y_pred) + map = self.mean_average_precision(y_true, y_pred) + scores["mr"] = mr + scores["mrr"] = mrr + scores["map"] = map + scores["scoring_time"] = time.time() - start + return scores diff --git 
a/SourceCodeTools/models/graph/train/deprecated/sampling_multitask.py b/SourceCodeTools/models/graph/train/deprecated/sampling_multitask.py index f17197cf..d1f361b7 100644 --- a/SourceCodeTools/models/graph/train/deprecated/sampling_multitask.py +++ b/SourceCodeTools/models/graph/train/deprecated/sampling_multitask.py @@ -11,6 +11,7 @@ from os.path import join import logging +from SourceCodeTools.mltools.torch import compute_accuracy from SourceCodeTools.models.Embedder import Embedder from SourceCodeTools.models.graph.ElementEmbedder import ElementEmbedderWithBpeSubwords from SourceCodeTools.models.graph.ElementEmbedderBase import ElementEmbedderBase @@ -19,10 +20,6 @@ from SourceCodeTools.models.graph.NodeEmbedder import NodeEmbedder -def _compute_accuracy(pred_, true_): - return torch.sum(pred_ == true_).item() / len(true_) - - class SamplingMultitaskTrainer: def __init__(self, @@ -382,7 +379,7 @@ def _evaluate_embedder(self, ee, lp, loader, neg_sampling_factor=1): logp = nn.functional.log_softmax(logits, 1) loss = nn.functional.cross_entropy(logp, labels) - acc = _compute_accuracy(logp.argmax(dim=1), labels) + acc = compute_accuracy(logp.argmax(dim=1), labels) total_loss += loss.item() total_acc += acc @@ -405,7 +402,7 @@ def _evaluate_nodes(self, ee, lp, create_api_call_loader, loader, logp = nn.functional.log_softmax(logits, 1) loss = nn.functional.cross_entropy(logp, labels) - acc = _compute_accuracy(logp.argmax(dim=1), labels) + acc = compute_accuracy(logp.argmax(dim=1), labels) total_loss += loss.item() total_acc += acc @@ -560,9 +557,9 @@ def train_all(self): input_nodes_api_call, seeds_api_call, blocks_api_call ) - train_acc_node_name = _compute_accuracy(logits_node_name.argmax(dim=1), labels_node_name) - train_acc_var_use = _compute_accuracy(logits_var_use.argmax(dim=1), labels_var_use) - train_acc_api_call = _compute_accuracy(logits_api_call.argmax(dim=1), labels_api_call) + train_acc_node_name = compute_accuracy(logits_node_name.argmax(dim=1), 
labels_node_name) + train_acc_var_use = compute_accuracy(logits_var_use.argmax(dim=1), labels_var_use) + train_acc_api_call = compute_accuracy(logits_api_call.argmax(dim=1), labels_api_call) train_logits = torch.cat([logits_node_name, logits_var_use, logits_api_call], 0) train_labels = torch.cat([labels_node_name, labels_var_use, labels_api_call], 0) diff --git a/SourceCodeTools/models/graph/train/objectives/AbstractObjective.py b/SourceCodeTools/models/graph/train/objectives/AbstractObjective.py index 93b10249..8e88d5fc 100644 --- a/SourceCodeTools/models/graph/train/objectives/AbstractObjective.py +++ b/SourceCodeTools/models/graph/train/objectives/AbstractObjective.py @@ -1,52 +1,96 @@ import logging from abc import abstractmethod -from collections import OrderedDict -from itertools import chain +from collections import defaultdict import dgl import torch -from torch.nn import CosineEmbeddingLoss +from tqdm import tqdm -from SourceCodeTools.code.data.sourcetrail.SubwordMasker import SubwordMasker -from SourceCodeTools.models.graph.ElementEmbedder import ElementEmbedderWithBpeSubwords, NameEmbedderWithGroups, \ - GraphLinkSampler +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker +from SourceCodeTools.mltools.torch import compute_accuracy +from SourceCodeTools.models.graph.ElementEmbedder import ElementEmbedderWithBpeSubwords, GraphLinkSampler from SourceCodeTools.models.graph.ElementEmbedderBase import ElementEmbedderBase -from SourceCodeTools.models.graph.LinkPredictor import LinkPredictor, CosineLinkPredictor, BilinearLinkPedictor +from SourceCodeTools.models.graph.LinkPredictor import CosineLinkPredictor, BilinearLinkPedictor, L2LinkPredictor import torch.nn as nn -def _compute_accuracy(pred_: torch.Tensor, true_: torch.Tensor): - return torch.sum(pred_ == true_).item() / len(true_) +class ZeroEdges(Exception): + def __init__(self, *args): + super(ZeroEdges, self).__init__(*args) + + +class EarlyStoppingTracker: + def __init__(self, 
early_stopping_tolerance): + self.early_stopping_tolerance = early_stopping_tolerance + self.early_stopping_counter = 0 + self.early_stopping_value = 0. + self.early_stopping_trigger = False + + def should_stop(self, metric): + if metric <= self.early_stopping_value: + self.early_stopping_counter += 1 + if self.early_stopping_counter >= self.early_stopping_tolerance: + return True + else: + self.early_stopping_counter = 0 + self.early_stopping_value = metric + return False + + def reset(self): + self.early_stopping_counter = 0 + self.early_stopping_value = 0. + + +def sum_scores(s): + n = len(s) + if n == 0: + n += 1 + return sum(s) / n class AbstractObjective(nn.Module): + # # set in the init + # name = None + # graph_model = None + # sampling_neighbourhood_size = None + # batch_size = None + # target_emb_size = None + # node_embedder = None + # device = None + # masker = None + # link_predictor_type = None + # measure_scores = None + # dilate_scores = None + # early_stopping_tracker = None + # early_stopping_trigger = None + # + # # set elsewhere + # target_embedder = None + # link_predictor = None + # positive_label = None + # negative_label = None + # label_dtype = None + # + # train_loader = None + # test_loader = None + # val_loader = None + # num_train_batches = None + # num_test_batches = None + # num_val_batches = None + # + # ntypes = None + def __init__( - # self, name, objective_type, graph_model, node_embedder, nodes, data_loading_func, device, self, name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, - tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker=None, - measure_ndcg=False, dilate_ndcg=1 + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, + measure_scores=False, dilate_scores=1, early_stopping=False, early_stopping_tolerance=20, nn_index="brute", + ns_groups=None ): - """ - :param name: 
name for reference - :param objective_type: one of: graph_link_prediction|graph_link_classification|subword_ranker|node_classification - :param graph_model: - :param nodes: - :param data_loading_func: - :param device: - :param sampling_neighbourhood_size: - :param tokenizer_path: - :param target_emb_size: - :param link_predictor_type: - """ super(AbstractObjective, self).__init__() - # if objective_type not in {"graph_link_prediction", "graph_link_classification", "subword_ranker", "classification"}: - # raise NotImplementedError() - self.name = name - # self.type = objective_type self.graph_model = graph_model self.sampling_neighbourhood_size = sampling_neighbourhood_size self.batch_size = batch_size @@ -55,8 +99,13 @@ def __init__( self.device = device self.masker = masker self.link_predictor_type = link_predictor_type - self.measure_ndcg = measure_ndcg - self.dilate_ndcg = dilate_ndcg + self.measure_scores = measure_scores + self.dilate_scores = dilate_scores + self.nn_index = nn_index + self.early_stopping_tracker = EarlyStoppingTracker( + early_stopping_tolerance) if early_stopping else None + self.early_stopping_trigger = False + self.ns_groups = ns_groups self.verify_parameters() @@ -64,10 +113,12 @@ def __init__( self.create_link_predictor() self.create_loaders() + self.target_embedding_fn = self.get_targets_from_embedder + self.negative_factor = 1 + self.update_embeddings_for_queries = True + @abstractmethod def verify_parameters(self): - # if self.link_predictor_type == "inner_prod": # TODO incorrect - # assert self.target_emb_size == self.graph_model.emb_size, "Graph embedding and target embedder dimensionality should match for `inner_prod` type of link predictor." 
pass def create_base_element_sampler(self, data_loading_func, nodes): @@ -78,7 +129,8 @@ def create_base_element_sampler(self, data_loading_func, nodes): def create_graph_link_sampler(self, data_loading_func, nodes): self.target_embedder = GraphLinkSampler( elements=data_loading_func(), nodes=nodes, compact_dst=False, dst_to_global=True, - emb_size=self.target_emb_size + emb_size=self.target_emb_size, device=self.device, method=self.link_predictor_type, nn_index=self.nn_index, + ns_groups=self.ns_groups ) def create_subword_embedder(self, data_loading_func, nodes, tokenizer_path): @@ -89,64 +141,90 @@ def create_subword_embedder(self, data_loading_func, nodes, tokenizer_path): @abstractmethod def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): - # # create target embedder - # if self.type == "graph_link_prediction" or self.type == "graph_link_classification": - # self.create_base_element_sampler(data_loading_func, nodes) - # elif self.type == "subword_ranker": - # self.create_subword_embedder(data_loading_func, nodes, tokenizer_path) - # elif self.type == "node_classification": - # self.target_embedder = None + # self.create_base_element_sampler(data_loading_func, nodes) + # self.create_graph_link_sampler(data_loading_func, nodes) + # self.create_subword_embedder(data_loading_func, nodes, tokenizer_path) raise NotImplementedError() def create_nn_link_predictor(self): - # self.link_predictor = LinkPredictor(self.target_emb_size + self.graph_model.emb_size).to(self.device) - self.link_predictor = BilinearLinkPedictor(self.target_emb_size, self.graph_model.emb_size, 2).to(self.device) + self.link_predictor = BilinearLinkPedictor( + self.target_emb_size, self.graph_model.emb_size, 2).to(self.device) self.positive_label = 1 self.negative_label = 0 self.label_dtype = torch.long def create_inner_prod_link_predictor(self): - self.link_predictor = CosineLinkPredictor().to(self.device) - self.cosine_loss = CosineEmbeddingLoss(margin=0.4) + self.margin 
= -0.2 + self.target_embedder.set_margin(self.margin) + self.link_predictor = CosineLinkPredictor( + margin=self.margin).to(self.device) + self.hinge_loss = nn.HingeEmbeddingLoss(margin=1. - self.margin) + + def cosine_loss(x1, x2, label): + sim = nn.CosineSimilarity() + dist = 1. - sim(x1, x2) + return self.hinge_loss(dist, label) + + # self.cosine_loss = CosineEmbeddingLoss(margin=self.margin) + self.cosine_loss = cosine_loss + self.positive_label = 1. + self.negative_label = -1. + self.label_dtype = torch.float32 + + def create_l2_link_predictor(self): + self.margin = 2.0 + self.target_embedder.set_margin(self.margin) + self.link_predictor = L2LinkPredictor().to(self.device) + # self.hinge_loss = nn.HingeEmbeddingLoss(margin=self.margin) + self.triplet_loss = nn.TripletMarginLoss(margin=self.margin) + + def l2_loss(x1, x2, label): + half = x1.shape[0] // 2 + pos = x2[:half, :] + neg = x2[half:, :] + + return self.triplet_loss(x1[:half, :], pos, neg) + # dist = torch.norm(x1 - x2, dim=-1) + # return self.hinge_loss(dist, label) + + self.l2_loss = l2_loss self.positive_label = 1. self.negative_label = -1. 
self.label_dtype = torch.float32 - @abstractmethod def create_link_predictor(self): - # # create link predictors - # if self.type in {"graph_link_prediction", "graph_link_classification", "subword_ranker"}: - # if self.link_predictor_type == "nn": - # self.create_nn_link_predictor() - # elif self.link_predictor_type == "inner_prod": - # self.create_inner_prod_link_predictor() - # else: - # raise NotImplementedError() - # else: - # # for node classifier - # self.link_predictor = LinkPredictor(self.graph_model.emb_size).to(self.device) - raise NotImplementedError() + if self.link_predictor_type == "nn": + self.create_nn_link_predictor() + elif self.link_predictor_type == "inner_prod": + self.create_inner_prod_link_predictor() + elif self.link_predictor_type == "l2": + self.create_l2_link_predictor() + else: + raise NotImplementedError() - # @abstractmethod def create_loaders(self): - # create loaders - # if self.type in {"graph_link_prediction", "graph_link_classification", "subword_ranker"}: + print("Number of nodes", self.graph_model.g.number_of_nodes()) train_idx, val_idx, test_idx = self._get_training_targets() train_idx, val_idx, test_idx = self.target_embedder.create_idx_pools( train_idx=train_idx, val_idx=val_idx, test_idx=test_idx ) - # else: - # raise NotImplementedError() logging.info( f"Pool sizes for {self.name}: train {self._idx_len(train_idx)}, " f"val {self._idx_len(val_idx)}, " f"test {self._idx_len(test_idx)}." 
) - self.train_loader, self.test_loader, self.val_loader = self._get_loaders( + self.train_loader, self.val_loader, self.test_loader = self._get_loaders( train_idx=train_idx, val_idx=val_idx, test_idx=test_idx, batch_size=self.batch_size # batch_size_node_name ) + def get_num_nodes(ids): + return sum(len(ids[key_]) for key_ in ids) // self.batch_size + 1 + + self.num_train_batches = get_num_nodes(train_idx) + self.num_test_batches = get_num_nodes(test_idx) + self.num_val_batches = get_num_nodes(val_idx) + def _idx_len(self, idx): if isinstance(idx, dict): length = 0 @@ -160,7 +238,8 @@ def _handle_non_unique(self, non_unique_ids): id_list = non_unique_ids.tolist() unique_ids = list(set(id_list)) new_position = dict(zip(unique_ids, range(len(unique_ids)))) - slice_map = torch.tensor(list(map(lambda x: new_position[x], id_list)), dtype=torch.long) + slice_map = torch.tensor( + list(map(lambda x: new_position[x], id_list)), dtype=torch.long) return torch.tensor(unique_ids, dtype=torch.long), slice_map def _get_training_targets(self): @@ -179,49 +258,54 @@ def _get_training_targets(self): # labels = labels[key] self.use_types = False - train_idx = { - ntype: torch.nonzero(self.graph_model.g.nodes[ntype].data['train_mask'], as_tuple=False).squeeze() - for ntype in self.ntypes - } - val_idx = { - ntype: torch.nonzero(self.graph_model.g.nodes[ntype].data['val_mask'], as_tuple=False).squeeze() - for ntype in self.ntypes - } - test_idx = { - ntype: torch.nonzero(self.graph_model.g.nodes[ntype].data['test_mask'], as_tuple=False).squeeze() - for ntype in self.ntypes - } + def get_targets(data_label): + return { + ntype: torch.nonzero( + self.graph_model.g.nodes[ntype].data[data_label], as_tuple=False).squeeze() + for ntype in self.ntypes + } + + train_idx = get_targets("train_mask") + val_idx = get_targets("val_mask") + test_idx = get_targets("test_mask") + + # train_idx = { + # ntype: torch.nonzero(self.graph_model.g.nodes[ntype].data['train_mask'], as_tuple=False).squeeze() 
+ # for ntype in self.ntypes + # } + # val_idx = { + # ntype: torch.nonzero(self.graph_model.g.nodes[ntype].data['val_mask'], as_tuple=False).squeeze() + # for ntype in self.ntypes + # } + # test_idx = { + # ntype: torch.nonzero(self.graph_model.g.nodes[ntype].data['test_mask'], as_tuple=False).squeeze() + # for ntype in self.ntypes + # } else: # not sure when this is called raise NotImplementedError() - # self.ntypes = None - # # labels = g.ndata['labels'] - # train_idx = self.graph_model.g.ndata['train_mask'] - # val_idx = self.graph_model.g.ndata['val_mask'] - # test_idx = self.graph_model.g.ndata['test_mask'] - # self.use_types = False return train_idx, val_idx, test_idx - def _get_loaders(self, train_idx, val_idx, test_idx, batch_size): - # train sampler - layers = self.graph_model.num_layers - sampler = dgl.dataloading.MultiLayerNeighborSampler([self.sampling_neighbourhood_size] * layers) + def _create_loader(self, ids, batch_size=None, shuffle=False): + if batch_size is None: + # TODO + # only works when ids do not have types + batch_size = self._idx_len(ids) + sampler = dgl.dataloading.MultiLayerFullNeighborSampler( + self.graph_model.num_layers) loader = dgl.dataloading.NodeDataLoader( - self.graph_model.g, train_idx, sampler, batch_size=batch_size, shuffle=False, num_workers=0) + self.graph_model.g, ids, sampler, batch_size=batch_size, shuffle=shuffle, num_workers=0) + return loader - # validation sampler - # we do not use full neighbor to save computation resources - val_sampler = dgl.dataloading.MultiLayerNeighborSampler([self.sampling_neighbourhood_size] * layers) - val_loader = dgl.dataloading.NodeDataLoader( - self.graph_model.g, val_idx, val_sampler, batch_size=batch_size, shuffle=False, num_workers=0) + def _get_loaders(self, train_idx, val_idx, test_idx, batch_size): - # we do not use full neighbor to save computation resources - test_sampler = dgl.dataloading.MultiLayerNeighborSampler([self.sampling_neighbourhood_size] * layers) - test_loader = 
dgl.dataloading.NodeDataLoader( - self.graph_model.g, test_idx, test_sampler, batch_size=batch_size, shuffle=False, num_workers=0) + train_loader = self._create_loader( + train_idx, batch_size, shuffle=False) + val_loader = self._create_loader(val_idx, batch_size, shuffle=False) + test_loader = self._create_loader(test_idx, batch_size, shuffle=False) - return loader, val_loader, test_loader + return train_loader, val_loader, test_loader def reset_iterator(self, data_split): iter_name = f"{data_split}_loader_iter" @@ -230,14 +314,14 @@ def reset_iterator(self, data_split): def loader_next(self, data_split): iter_name = f"{data_split}_loader_iter" if not hasattr(self, iter_name): - setattr(self, iter_name, iter(getattr(self, f"{data_split}_loader"))) + setattr(self, iter_name, iter( + getattr(self, f"{data_split}_loader"))) return next(getattr(self, iter_name)) - def _create_loader(self, indices): - sampler = dgl.dataloading.MultiLayerNeighborSampler( - [self.sampling_neighbourhood_size] * self.graph_model.num_layers) - return dgl.dataloading.NodeDataLoader( - self.graph_model.g, indices, sampler, batch_size=len(indices), num_workers=0) + # def _create_loader(self, indices): + # sampler = dgl.dataloading.MultiLayerFullNeighborSampler(self.graph_model.num_layers) + # return dgl.dataloading.NodeDataLoader( + # self.graph_model.g, indices, sampler, batch_size=len(indices), num_workers=0) def _extract_embed(self, input_nodes, train_embeddings=True, masked=None): emb = {} @@ -257,30 +341,48 @@ def compute_acc_loss(self, node_embs_, element_embs_, labels): elif self.link_predictor_type == "inner_prod": loss = self.cosine_loss(node_embs_, element_embs_, labels) labels[labels < 0] = 0 + elif self.link_predictor_type == "l2": + loss = self.l2_loss(node_embs_, element_embs_, labels) + labels[labels < 0] = 0 + # num_examples = len(labels) // 2 + # anchor = node_embs_[:num_examples, :] + # positive = element_embs_[:num_examples, :] + # negative = element_embs_[num_examples:, :] 
+ # # pos_labels_ = labels[:num_examples] + # # neg_labels_ = labels[num_examples:] + # margin = 1. + # triplet = nn.TripletMarginLoss(margin=margin) + # self.target_embedder.set_margin(margin) + # loss = triplet(anchor, positive, negative) + # logits = (torch.norm(node_embs_ - element_embs_, keepdim=True) < 1.).float() + # logits = torch.cat([1 - logits, logits], dim=1) + # labels[labels < 0] = 0 else: raise NotImplementedError() - acc = _compute_accuracy(logits.argmax(dim=1), labels) + acc = compute_accuracy(logits.argmax(dim=1), labels) return acc, loss - - def _logits_batch(self, input_nodes, blocks, train_embeddings=True, masked=None): + def _graph_embeddings(self, input_nodes, blocks, train_embeddings=True, masked=None): cumm_logits = [] if self.use_types: # emb = self._extract_embed(self.graph_model.node_embed(), input_nodes) - emb = self._extract_embed(input_nodes, train_embeddings, masked=masked) + emb = self._extract_embed( + input_nodes, train_embeddings, masked=masked) else: if self.ntypes is not None: # single node type key = next(iter(self.ntypes)) input_nodes = {key: input_nodes} # emb = self._extract_embed(self.graph_model.node_embed(), input_nodes) - emb = self._extract_embed(input_nodes, train_embeddings, masked=masked) + emb = self._extract_embed( + input_nodes, train_embeddings, masked=masked) else: - emb = self.node_embedder(node_ids=input_nodes, train_embeddings=train_embeddings, masked=masked) + emb = self.node_embedder( + node_ids=input_nodes, train_embeddings=train_embeddings, masked=masked) # emb = self.graph_model.node_embed()[input_nodes] logits = self.graph_model(emb, blocks) @@ -307,96 +409,184 @@ def _logits_batch(self, input_nodes, blocks, train_embeddings=True, masked=None) def seeds_to_global(self, seeds): if type(seeds) is dict: - indices = [self.graph_model.g.nodes[ntype].data["global_graph_id"][seeds[ntype]] for ntype in seeds] + indices = [self.graph_model.g.nodes[ntype].data["global_graph_id"] + [seeds[ntype]] for ntype in 
seeds] return torch.cat(indices, dim=0) else: return seeds - def _logits_embedder(self, node_embeddings, elem_embedder, link_predictor, seeds, negative_factor=1): - k = negative_factor - indices = self.seeds_to_global(seeds).tolist() - batch_size = len(indices) + def sample_negative(self, ids, k, neg_sampling_strategy): + if neg_sampling_strategy is not None: + negative = self.target_embedder.sample_negative( + k, ids=ids, strategy=neg_sampling_strategy + ) + else: + negative = self.target_embedder.sample_negative( + k, ids=ids, + ) + return negative - node_embeddings_batch = node_embeddings - element_embeddings = elem_embedder(elem_embedder[indices].to(self.device)) + def get_targets_from_nodes( + self, positive_indices, negative_indices=None, train_embeddings=True + ): + negative_indices = torch.tensor( + negative_indices, dtype=torch.long) if negative_indices is not None else None - # labels_pos = torch.ones(batch_size, dtype=torch.long) - labels_pos = torch.full((batch_size,), self.positive_label, dtype=self.label_dtype) + def get_embeddings_for_targets(dst): + unique_dst, slice_map = self._handle_non_unique(dst) + assert unique_dst[slice_map].tolist() == dst.tolist() - node_embeddings_neg_batch = node_embeddings_batch.repeat(k, 1) - # negative_random = elem_embedder(elem_embedder.sample_negative(batch_size * k).to(self.device)) - negative_random = elem_embedder(elem_embedder.sample_negative(batch_size * k, ids=indices).to(self.device)) # closest negative + dataloader = self._create_loader(unique_dst) + input_nodes, dst_seeds, blocks = next(iter(dataloader)) + blocks = [blk.to(self.device) for blk in blocks] + assert dst_seeds.shape == unique_dst.shape + assert dst_seeds.tolist() == unique_dst.tolist() + unique_dst_embeddings = self._graph_embeddings( + input_nodes, blocks, train_embeddings) # use_types, ntypes) + dst_embeddings = unique_dst_embeddings[slice_map.to(self.device)] + + if self.update_embeddings_for_queries: + 
self.target_embedder.set_embed(unique_dst.detach().cpu().numpy(), + unique_dst_embeddings.detach().cpu().numpy()) + + return dst_embeddings - # labels_neg = torch.zeros(batch_size * k, dtype=torch.long) - labels_neg = torch.full((batch_size * k,), self.negative_label, dtype=self.label_dtype) + positive_dst = get_embeddings_for_targets(positive_indices) + negative_dst = get_embeddings_for_targets( + negative_indices) if negative_indices is not None else None + return positive_dst, negative_dst - # positive_batch = torch.cat([node_embeddings_batch, element_embeddings], 1) - # negative_batch = torch.cat([node_embeddings_neg_batch, negative_random], 1) - # batch = torch.cat([positive_batch, negative_batch], 0) - # labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) + def get_targets_from_embedder( + self, positive_indices, negative_indices=None, train_embeddings=True + ): + + # def get_embeddings_for_targets(dst): + # unique_dst, slice_map = self._handle_non_unique(dst) + # assert unique_dst[slice_map].tolist() == dst.tolist() + # unique_dst_embeddings = self.target_embedder(unique_dst.to(self.device)) + # dst_embeddings = unique_dst_embeddings[slice_map.to(self.device)] # - # logits = link_predictor(batch) + # if self.update_embeddings_for_queries: + # self.target_embedder.set_embed(unique_dst.detach().cpu().numpy(), + # unique_dst_embeddings.detach().cpu().numpy()) # - # return logits, labels - nodes = torch.cat([node_embeddings_batch, node_embeddings_neg_batch], dim=0) - embs = torch.cat([element_embeddings, negative_random], dim=0) - labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) - return nodes, embs, labels + # return dst_embeddings + + positive_dst = self.target_embedder(positive_indices.to(self.device)) + negative_dst = self.target_embedder(negative_indices.to( + self.device)) if negative_indices is not None else None + # + # positive_dst = get_embeddings_for_targets(positive_indices) + # negative_dst = 
get_embeddings_for_targets(negative_indices) if negative_indices is not None else None - def _logits_nodes(self, node_embeddings, - elem_embedder, link_predictor, create_dataloader, - src_seeds, negative_factor=1, train_embeddings=True): + return positive_dst, negative_dst + + def create_positive_labels(self, ids): + return torch.full((len(ids),), self.positive_label, dtype=self.label_dtype) + + def create_negative_labels(self, ids, k): + return torch.full((len(ids) * k,), self.negative_label, dtype=self.label_dtype) + + def prepare_for_prediction( + self, node_embeddings, seeds, target_embedding_fn, negative_factor=1, + neg_sampling_strategy=None, train_embeddings=True, + ): k = negative_factor - indices = self.seeds_to_global(src_seeds).tolist() + indices = self.seeds_to_global(seeds).tolist() batch_size = len(indices) node_embeddings_batch = node_embeddings - next_call_indices = elem_embedder[indices] # this assumes indices is torch tensor - - # dst targets are not unique - unique_dst, slice_map = self._handle_non_unique(next_call_indices) - assert unique_dst[slice_map].tolist() == next_call_indices.tolist() - - dataloader = create_dataloader(unique_dst) - input_nodes, dst_seeds, blocks = next(iter(dataloader)) - blocks = [blk.to(self.device) for blk in blocks] - assert dst_seeds.shape == unique_dst.shape - assert dst_seeds.tolist() == unique_dst.tolist() - unique_dst_embeddings = self._logits_batch(input_nodes, blocks, train_embeddings) # use_types, ntypes) - next_call_embeddings = unique_dst_embeddings[slice_map.to(self.device)] - # labels_pos = torch.ones(batch_size, dtype=torch.long) - labels_pos = torch.full((batch_size,), self.positive_label, dtype=self.label_dtype) - node_embeddings_neg_batch = node_embeddings_batch.repeat(k, 1) - # negative_indices = torch.tensor(elem_embedder.sample_negative( - # batch_size * k), dtype=torch.long) # embeddings are sampled from 3/4 unigram distribution - negative_indices = torch.tensor(elem_embedder.sample_negative( - 
batch_size * k, ids=indices), dtype=torch.long) # closest negative - unique_negative, slice_map = self._handle_non_unique(negative_indices) - assert unique_negative[slice_map].tolist() == negative_indices.tolist() - - dataloader = create_dataloader(unique_negative) - input_nodes, dst_seeds, blocks = next(iter(dataloader)) - blocks = [blk.to(self.device) for blk in blocks] - assert dst_seeds.shape == unique_negative.shape - assert dst_seeds.tolist() == unique_negative.tolist() - unique_negative_random = self._logits_batch(input_nodes, blocks, train_embeddings) # use_types, ntypes) - negative_random = unique_negative_random[slice_map.to(self.device)] - # labels_neg = torch.zeros(batch_size * k, dtype=torch.long) - labels_neg = torch.full((batch_size * k,), self.negative_label, dtype=self.label_dtype) - - # positive_batch = torch.cat([node_embeddings_batch, next_call_embeddings], 1) - # negative_batch = torch.cat([node_embeddings_neg_batch, negative_random], 1) - # batch = torch.cat([positive_batch, negative_batch], 0) - # labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) - # - # logits = link_predictor(batch) - # - # return logits, labels - nodes = torch.cat([node_embeddings_batch, node_embeddings_neg_batch], dim=0) - embs = torch.cat([next_call_embeddings, negative_random], dim=0) + + positive_indices = self.target_embedder[indices] + negative_indices = self.sample_negative( + k=batch_size * k, ids=indices, neg_sampling_strategy=neg_sampling_strategy + ) + + positive_dst, negative_dst = target_embedding_fn( + positive_indices, negative_indices, train_embeddings + ) + + # TODO breaks cache in + # SourceCodeTools.models.graph.train.objectives.GraphLinkClassificationObjective.TargetLinkMapper.get_labels + labels_pos = self.create_positive_labels(indices) + labels_neg = self.create_negative_labels(indices, k) + + src_embs = torch.cat( + [node_embeddings_batch, node_embeddings_neg_batch], dim=0) + dst_embs = torch.cat([positive_dst, negative_dst], dim=0) 
labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) - return nodes, embs, labels + return src_embs, dst_embs, labels + + # def _logits_embedder( + # self, node_embeddings, elem_embedder, link_predictor, seeds, negative_factor=1, neg_sampling_strategy=None + # ): + # k = negative_factor + # indices = self.seeds_to_global(seeds).tolist() + # batch_size = len(indices) + # + # node_embeddings_batch = node_embeddings + # node_embeddings_neg_batch = node_embeddings_batch.repeat(k, 1) + # + # element_embeddings = elem_embedder(elem_embedder[indices].to(self.device)) + # negative_random = elem_embedder(self.sample_negative( + # k=batch_size * k, ids=indices, neg_sampling_strategy=neg_sampling_strategy + # ).to(self.device)) + # + # labels_pos = torch.full((batch_size,), self.positive_label, dtype=self.label_dtype) + # labels_neg = torch.full((batch_size * k,), self.negative_label, dtype=self.label_dtype) + # + # src_embs = torch.cat([node_embeddings_batch, node_embeddings_neg_batch], dim=0) + # dst_embs = torch.cat([element_embeddings, negative_random], dim=0) + # labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) + # return src_embs, dst_embs, labels + # + # def _logits_nodes( + # self, node_embeddings, elem_embedder, link_predictor, create_dataloader, + # src_seeds, negative_factor=1, train_embeddings=True, neg_sampling_strategy=None, + # update_embeddings_for_queries=False + # ): + # k = negative_factor + # indices = self.seeds_to_global(src_seeds).tolist() + # batch_size = len(indices) + # + # node_embeddings_batch = node_embeddings + # node_embeddings_neg_batch = node_embeddings_batch.repeat(k, 1) + # + # next_call_indices = elem_embedder[indices] # this assumes indices is torch tensor + # negative_indices = torch.tensor(self.sample_negative( + # k=batch_size * k, ids=indices, neg_sampling_strategy=neg_sampling_strategy + # ), dtype=torch.long) + # + # # dst targets are not unique + # def get_embeddings_for_targets(dst, 
update_embeddings_for_queries): + # unique_dst, slice_map = self._handle_non_unique(dst) + # assert unique_dst[slice_map].tolist() == dst.tolist() + # + # dataloader = create_dataloader(unique_dst) + # input_nodes, dst_seeds, blocks = next(iter(dataloader)) + # blocks = [blk.to(self.device) for blk in blocks] + # assert dst_seeds.shape == unique_dst.shape + # assert dst_seeds.tolist() == unique_dst.tolist() + # unique_dst_embeddings = self._graph_embeddings(input_nodes, blocks, train_embeddings) # use_types, ntypes) + # dst_embeddings = unique_dst_embeddings[slice_map.to(self.device)] + # + # if update_embeddings_for_queries: + # self.target_embedder.set_embed(unique_dst.detach().cpu().numpy(), + # unique_dst_embeddings.detach().cpu().numpy()) + # + # return dst_embeddings + # + # next_call_embeddings = get_embeddings_for_targets(next_call_indices, update_embeddings_for_queries) + # negative_random = get_embeddings_for_targets(negative_indices, update_embeddings_for_queries) + # + # labels_pos = torch.full((batch_size,), self.positive_label, dtype=self.label_dtype) + # labels_neg = torch.full((batch_size * k,), self.negative_label, dtype=self.label_dtype) + # + # src_embs = torch.cat([node_embeddings_batch, node_embeddings_neg_batch], dim=0) + # dst_embs = torch.cat([next_call_embeddings, negative_random], dim=0) + # labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) + # return src_embs, dst_embs, labels def seeds_to_python(self, seeds): if isinstance(seeds, dict): @@ -407,88 +597,34 @@ def seeds_to_python(self, seeds): python_seeds = seeds.tolist() return python_seeds - @abstractmethod - def forward(self, input_nodes, seeds, blocks, train_embeddings=True): - # masked = None - # if self.type in {"subword_ranker"}: - # masked = self.masker.get_mask(self.seeds_to_python(seeds)) - # graph_emb = self._logits_batch(input_nodes, blocks, train_embeddings, masked=masked) - # if self.type in {"subword_ranker"}: - # # logits, labels = 
self._logits_embedder(graph_emb, self.target_embedder, self.link_predictor, seeds) - # node_embs_, element_embs_, labels = self._logits_embedder(graph_emb, self.target_embedder, self.link_predictor, seeds) - # elif self.type in {"graph_link_prediction", "graph_link_classification"}: - # # logits, labels = self._logits_nodes(graph_emb, self.target_embedder, self.link_predictor, - # # self._create_loader, seeds, train_embeddings=train_embeddings) - # node_embs_, element_embs_, labels = self._logits_nodes(graph_emb, self.target_embedder, self.link_predictor, - # self._create_loader, seeds, train_embeddings=train_embeddings) - # else: - # raise NotImplementedError() - # - # # logits = self.link_predictor(node_embs_, element_embs_) - # # - # # acc = _compute_accuracy(logits.argmax(dim=1), labels) - # # logp = nn.functional.log_softmax(logits, 1) - # # loss = nn.functional.nll_loss(logp, labels) - # acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) - # - # return loss, acc - raise NotImplementedError() - - def _evaluate_embedder(self, ee, lp, data_split, neg_sampling_factor=1): - - total_loss = 0 - total_acc = 0 - ndcg_at = [1, 3, 5, 10] - total_ndcg = {f"ndcg@{k}": 0. 
for k in ndcg_at} - ndcg_count = 0 - count = 0 - - # if self.measure_ndcg: - # if self.link_predictor == "inner_prod": - # self.target_embedder.prepare_index() - - for input_nodes, seeds, blocks in getattr(self, f"{data_split}_loader"): - blocks = [blk.to(self.device) for blk in blocks] - - if self.masker is None: - masked = None - else: - masked = self.masker.get_mask(self.seeds_to_python(seeds)) - - src_embs = self._logits_batch(input_nodes, blocks, masked=masked) - # logits, labels = self._logits_embedder(src_embs, ee, lp, seeds, neg_sampling_factor) - node_embs_, element_embs_, labels = self._logits_embedder(src_embs, ee, lp, seeds, neg_sampling_factor) - - if self.measure_ndcg: - if count % self.dilate_ndcg == 0: - ndcg = self.target_embedder.score_candidates(self.seeds_to_global(seeds), src_embs, self.link_predictor, at=ndcg_at, type=self.link_predictor_type, device=self.device) - for key, val in ndcg.items(): - total_ndcg[key] = total_ndcg[key] + val - ndcg_count += 1 - - # logits = self.link_predictor(node_embs_, element_embs_) - # - # logp = nn.functional.log_softmax(logits, 1) - # loss = nn.functional.cross_entropy(logp, labels) - # acc = _compute_accuracy(logp.argmax(dim=1), labels) - - acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) + def forward(self, input_nodes, seeds, blocks, train_embeddings=True, neg_sampling_strategy=None): + masked = self.masker.get_mask(self.seeds_to_python( + seeds)) if self.masker is not None else None + graph_emb = self._graph_embeddings( + input_nodes, blocks, train_embeddings, masked=masked) + node_embs_, element_embs_, labels = self.prepare_for_prediction( + graph_emb, seeds, self.target_embedding_fn, negative_factor=self.negative_factor, + neg_sampling_strategy=neg_sampling_strategy, + train_embeddings=train_embeddings + ) - total_loss += loss.item() - total_acc += acc - count += 1 - return total_loss / count, total_acc / count, {key: val / ndcg_count for key, val in total_ndcg.items()} if 
self.measure_ndcg else None + acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) - def _evaluate_nodes(self, ee, lp, create_api_call_loader, data_split, neg_sampling_factor=1): + return loss, acc - total_loss = 0 - total_acc = 0 - ndcg_at = [1, 3, 5, 10] - total_ndcg = {f"ndcg@{k}": 0. for k in ndcg_at} - ndcg_count = 0 + def evaluate_objective(self, data_split, neg_sampling_strategy=None, negative_factor=1): + # total_loss = 0 + # total_acc = 0 + at = [1, 3, 5, 10] + # total_ndcg = {f"ndcg@{k}": 0. for k in ndcg_at} + # ndcg_count = 0 count = 0 - for input_nodes, seeds, blocks in getattr(self, f"{data_split}_loader"): + scores = defaultdict(list) + + for input_nodes, seeds, blocks in tqdm( + getattr(self, f"{data_split}_loader"), total=getattr(self, f"num_{data_split}_batches") + ): blocks = [blk.to(self.device) for blk in blocks] if self.masker is None: @@ -496,93 +632,155 @@ def _evaluate_nodes(self, ee, lp, create_api_call_loader, data_split, neg_sampli else: masked = self.masker.get_mask(self.seeds_to_python(seeds)) - src_embs = self._logits_batch(input_nodes, blocks, masked=masked) - # logits, labels = self._logits_nodes(src_embs, ee, lp, create_api_call_loader, seeds, neg_sampling_factor) - node_embs_, element_embs_, labels = self._logits_nodes(src_embs, ee, lp, create_api_call_loader, seeds, neg_sampling_factor) - - if self.measure_ndcg: - if count % self.dilate_ndcg == 0: - ndcg = self.target_embedder.score_candidates(self.seeds_to_global(seeds), src_embs, self.link_predictor, at=ndcg_at, type=self.link_predictor_type, device=self.device) - for key, val in ndcg.items(): - total_ndcg[key] = total_ndcg[key] + val - ndcg_count += 1 - - # logits = self.link_predictor(node_embs_, element_embs_) - # - # logp = nn.functional.log_softmax(logits, 1) - # loss = nn.functional.cross_entropy(logp, labels) - # acc = _compute_accuracy(logp.argmax(dim=1), labels) - - acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) - - total_loss += 
loss.item() - total_acc += acc + src_embs = self._graph_embeddings( + input_nodes, blocks, masked=masked) + node_embs_, element_embs_, labels = self.prepare_for_prediction( + src_embs, seeds, self.target_embedding_fn, negative_factor=negative_factor, + neg_sampling_strategy=neg_sampling_strategy, + train_embeddings=False + ) + + if self.measure_scores: + if count % self.dilate_scores == 0: + scores_ = self.target_embedder.score_candidates(self.seeds_to_global(seeds), src_embs, + self.link_predictor, at=at, + type=self.link_predictor_type, device=self.device) + for key, val in scores_.items(): + scores[key].append(val) + + acc, loss = self.compute_acc_loss( + node_embs_, element_embs_, labels) + + scores["Loss"].append(loss.item()) + scores["Accuracy"].append(acc) count += 1 - return total_loss / count, total_acc / count, {key: val / ndcg_count for key, val in total_ndcg.items()} if self.measure_ndcg else None - @abstractmethod - def evaluate(self, data_split, neg_sampling_factor=1): - # if self.type in {"subword_ranker"}: - # loss, acc, ndcg = self._evaluate_embedder( - # self.target_embedder, self.link_predictor, data_split=data_split, neg_sampling_factor=neg_sampling_factor - # ) - # elif self.type in {"graph_link_prediction", "graph_link_classification"}: - # loss, acc = self._evaluate_nodes( - # self.target_embedder, self.link_predictor, self._create_loader, data_split=data_split, - # neg_sampling_factor=neg_sampling_factor - # ) - # ndcg = None - # else: - # raise NotImplementedError() - # - # return loss, acc, ndcg - raise NotImplementedError() + scores = {key: sum_scores(val) for key, val in scores.items()} + return scores + # return total_loss / count, total_acc / count, {key: val / ndcg_count for key, val in + # total_ndcg.items()} if self.measure_scores else None + + # def _evaluate_embedder(self, ee, lp, data_split, neg_sampling_factor=1): + # + # total_loss = 0 + # total_acc = 0 + # ndcg_at = [1, 3, 5, 10] + # total_ndcg = {f"ndcg@{k}": 0. 
for k in ndcg_at} + # ndcg_count = 0 + # count = 0 + # + # for input_nodes, seeds, blocks in tqdm( + # getattr(self, f"{data_split}_loader"), total=getattr(self, f"num_{data_split}_batches") + # ): + # blocks = [blk.to(self.device) for blk in blocks] + # + # if self.masker is None: + # masked = None + # else: + # masked = self.masker.get_mask(self.seeds_to_python(seeds)) + # + # src_embs = self._graph_embeddings(input_nodes, blocks, masked=masked) + # # logits, labels = self._logits_embedder(src_embs, ee, lp, seeds, neg_sampling_factor) + # node_embs_, element_embs_, labels = self._logits_embedder(src_embs, ee, lp, seeds, neg_sampling_factor) + # + # if self.measure_scores: + # if count % self.dilate_scores == 0: + # ndcg = self.target_embedder.score_candidates(self.seeds_to_global(seeds), src_embs, self.link_predictor, at=ndcg_at, type=self.link_predictor_type, device=self.device) + # for key, val in ndcg.items(): + # total_ndcg[key] = total_ndcg[key] + val + # ndcg_count += 1 + # + # # logits = self.link_predictor(node_embs_, element_embs_) + # # + # # logp = nn.functional.log_softmax(logits, 1) + # # loss = nn.functional.cross_entropy(logp, labels) + # # acc = _compute_accuracy(logp.argmax(dim=1), labels) + # + # acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) + # + # total_loss += loss.item() + # total_acc += acc + # count += 1 + # return total_loss / count, total_acc / count, {key: val / ndcg_count for key, val in total_ndcg.items()} if self.measure_scores else None + # + # def _evaluate_nodes(self, ee, lp, create_api_call_loader, data_split, neg_sampling_factor=1): + # + # total_loss = 0 + # total_acc = 0 + # ndcg_at = [1, 3, 5, 10] + # total_ndcg = {f"ndcg@{k}": 0. 
for k in ndcg_at} + # ndcg_count = 0 + # count = 0 + # + # for input_nodes, seeds, blocks in tqdm( + # getattr(self, f"{data_split}_loader"), total=getattr(self, f"num_{data_split}_batches") + # ): + # blocks = [blk.to(self.device) for blk in blocks] + # + # if self.masker is None: + # masked = None + # else: + # masked = self.masker.get_mask(self.seeds_to_python(seeds)) + # + # src_embs = self._graph_embeddings(input_nodes, blocks, masked=masked) + # # logits, labels = self._logits_nodes(src_embs, ee, lp, create_api_call_loader, seeds, neg_sampling_factor) + # # node_embs_, element_embs_, labels = self._logits_nodes(src_embs, ee, lp, create_api_call_loader, seeds, neg_sampling_factor) + # + # node_embs_, element_embs_, labels = self.prepare_for_prediction( + # src_embs, seeds, self.target_embedding_fn, negative_factor=1, + # neg_sampling_strategy=None, train_embeddings=True, + # update_embeddings_for_queries=False + # ) + # + # if self.measure_scores: + # if count % self.dilate_scores == 0: + # ndcg = self.target_embedder.score_candidates(self.seeds_to_global(seeds), src_embs, self.link_predictor, at=ndcg_at, type=self.link_predictor_type, device=self.device) + # for key, val in ndcg.items(): + # total_ndcg[key] = total_ndcg[key] + val + # ndcg_count += 1 + # + # # logits = self.link_predictor(node_embs_, element_embs_) + # # + # # logp = nn.functional.log_softmax(logits, 1) + # # loss = nn.functional.cross_entropy(logp, labels) + # # acc = _compute_accuracy(logp.argmax(dim=1), labels) + # + # acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) + # + # total_loss += loss.item() + # total_acc += acc + # count += 1 + # return total_loss / count, total_acc / count, {key: val / ndcg_count for key, val in total_ndcg.items()} if self.measure_scores else None + + def check_early_stopping(self, metric): + """ + Checks the metric value and raises Early Stopping when the metric stops increasing. + Assumes that the metric grows. 
Uses accuracy as a metric by default. Check implementation of child classes. + :param metric: metric value + :return: Nothing + """ + if self.early_stopping_tracker is not None: + self.early_stopping_trigger = self.early_stopping_tracker.should_stop( + metric) + + def evaluate(self, data_split, *, neg_sampling_strategy=None, early_stopping=False, early_stopping_tolerance=20): + # negative factor is 1 for evaluation + scores = self.evaluate_objective( + data_split, neg_sampling_strategy=None, negative_factor=1) + # if data_split == "val": + # self.check_early_stopping(scores["Accuracy"]) + return scores @abstractmethod def parameters(self, recurse: bool = True): - # if self.type in {"subword_ranker"}: - # return chain(self.target_embedder.parameters(), self.link_predictor.parameters()) - # elif self.type in {"graph_link_prediction", "graph_link_classification"}: - # return self.link_predictor.parameters() - # else: - # raise NotImplementedError() raise NotImplementedError() @abstractmethod def custom_state_dict(self): - # state_dict = OrderedDict() - # if self.type in {"subword_ranker"}: - # for k, v in self.target_embedder.state_dict().items(): - # state_dict[f"target_embedder.{k}"] = v - # for k, v in self.link_predictor.state_dict().items(): - # state_dict[f"link_predictor.{k}"] = v - # # state_dict["target_embedder"] = self.target_embedder.state_dict() - # # state_dict["link_predictor"] = self.link_predictor.state_dict() - # elif self.type in {"graph_link_prediction", "graph_link_classification"}: - # for k, v in self.link_predictor.state_dict().items(): - # state_dict[f"link_predictor.{k}"] = v - # # state_dict["link_predictor"] = self.link_predictor.state_dict() - # else: - # raise NotImplementedError() - # - # return state_dict raise NotImplementedError() @abstractmethod def custom_load_state_dict(self, state_dicts): - # if self.type in {"subword_ranker"}: - # self.target_embedder.load_state_dict( - # self.get_prefix("target_embedder", state_dicts) - # ) - # 
self.link_predictor.load_state_dict( - # self.get_prefix("link_predictor", state_dicts) - # ) - # elif self.type in {"graph_link_prediction", "graph_link_classification"}: - # self.link_predictor.load_state_dict( - # self.get_prefix("link_predictor", state_dicts) - # ) - # else: - # raise NotImplementedError() raise NotImplementedError() def get_prefix(self, prefix, state_dict): diff --git a/SourceCodeTools/models/graph/train/objectives/GraphLinkClassificationObjective.py b/SourceCodeTools/models/graph/train/objectives/GraphLinkClassificationObjective.py index 0cd50be7..4f762541 100644 --- a/SourceCodeTools/models/graph/train/objectives/GraphLinkClassificationObjective.py +++ b/SourceCodeTools/models/graph/train/objectives/GraphLinkClassificationObjective.py @@ -1,15 +1,15 @@ -from collections import OrderedDict import numpy as np import random as rnd import torch from torch.nn import CrossEntropyLoss -from SourceCodeTools.code.data.sourcetrail.SubwordMasker import SubwordMasker +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker +from SourceCodeTools.mltools.torch import compute_accuracy +from SourceCodeTools.models.graph.ElementEmbedder import GraphLinkSampler from SourceCodeTools.models.graph.ElementEmbedderBase import ElementEmbedderBase -from SourceCodeTools.models.graph.LinkPredictor import LinkClassifier, BilinearLinkPedictor +from SourceCodeTools.models.graph.LinkPredictor import BilinearLinkPedictor, TransRLinkPredictor from SourceCodeTools.models.graph.train.Scorer import Scorer -from SourceCodeTools.models.graph.train.objectives import GraphLinkObjective -from SourceCodeTools.models.graph.train.sampling_multitask2 import _compute_accuracy +from SourceCodeTools.models.graph.train.objectives.GraphLinkObjective import GraphLinkObjective from SourceCodeTools.tabular.common import compact_property @@ -18,69 +18,32 @@ def __init__( self, name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, 
batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1, ns_groups=None ): super().__init__( name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, ns_groups=ns_groups + ) + self.measure_scores = True + self.update_embeddings_for_queries = True + + def create_graph_link_sampler(self, data_loading_func, nodes): + self.target_embedder = TargetLinkMapper( + elements=data_loading_func(), nodes=nodes, emb_size=self.target_emb_size, ns_groups=self.ns_groups ) - self.measure_ndcg = False def create_link_predictor(self): - self.link_predictor = BilinearLinkPedictor(self.target_emb_size, self.graph_model.emb_size, self.target_embedder.num_classes).to(self.device) + self.link_predictor = BilinearLinkPedictor( + self.target_emb_size, self.graph_model.emb_size, self.target_embedder.num_classes + ).to(self.device) # self.positive_label = 1 - self.negative_label = 0 + self.negative_label = self.target_embedder.null_class self.label_dtype = torch.long - def _logits_nodes(self, node_embeddings, - elem_embedder, link_predictor, create_dataloader, - src_seeds, negative_factor=1, train_embeddings=True): - k = negative_factor - indices = self.seeds_to_global(src_seeds).tolist() - batch_size = len(indices) - - node_embeddings_batch = node_embeddings - dst_indices, labels_pos = elem_embedder[indices] # this assumes indices is torch tensor - - # dst targets are not unique - unique_dst, slice_map = self._handle_non_unique(dst_indices) - assert unique_dst[slice_map].tolist() == dst_indices.tolist() - - dataloader = create_dataloader(unique_dst) - 
input_nodes, dst_seeds, blocks = next(iter(dataloader)) - blocks = [blk.to(self.device) for blk in blocks] - assert dst_seeds.shape == unique_dst.shape - assert dst_seeds.tolist() == unique_dst.tolist() - unique_dst_embeddings = self._logits_batch(input_nodes, blocks, train_embeddings) # use_types, ntypes) - dst_embeddings = unique_dst_embeddings[slice_map.to(self.device)] - - self.target_embedder.set_embed(unique_dst.detach().cpu().numpy(), unique_dst_embeddings.detach().cpu().numpy()) - - node_embeddings_neg_batch = node_embeddings_batch.repeat(k, 1) - # negative_indices = torch.tensor(elem_embedder.sample_negative( - # batch_size * k), dtype=torch.long) # embeddings are sampled from 3/4 unigram distribution - negative_indices = torch.tensor(elem_embedder.sample_negative( - batch_size * k, ids=indices), dtype=torch.long) # closest negative - unique_negative, slice_map = self._handle_non_unique(negative_indices) - assert unique_negative[slice_map].tolist() == negative_indices.tolist() - - dataloader = create_dataloader(unique_negative) - input_nodes, dst_seeds, blocks = next(iter(dataloader)) - blocks = [blk.to(self.device) for blk in blocks] - assert dst_seeds.shape == unique_negative.shape - assert dst_seeds.tolist() == unique_negative.tolist() - unique_negative_random = self._logits_batch(input_nodes, blocks, train_embeddings) # use_types, ntypes) - negative_random = unique_negative_random[slice_map.to(self.device)] - labels_neg = torch.full((batch_size * k,), self.negative_label, dtype=self.label_dtype) - - self.target_embedder.set_embed(unique_negative.detach().cpu().numpy(), unique_negative_random.detach().cpu().numpy()) - - nodes = torch.cat([node_embeddings_batch, node_embeddings_neg_batch], dim=0) - embs = torch.cat([dst_embeddings, negative_random], dim=0) - labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) - return nodes, embs, labels + def create_positive_labels(self, ids): + return torch.LongTensor(self.target_embedder.get_labels(ids)) 
def compute_acc_loss(self, node_embs_, element_embs_, labels): logits = self.link_predictor(node_embs_, element_embs_) @@ -89,14 +52,52 @@ def compute_acc_loss(self, node_embs_, element_embs_, labels): loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1)) - acc = _compute_accuracy(logits.argmax(dim=1), labels) + acc = compute_accuracy(logits.argmax(dim=1), labels) return acc, loss -class TargetLinkMapper(ElementEmbedderBase, Scorer): - def __init__(self, elements, nodes): - ElementEmbedderBase.__init__(self, elements=elements, nodes=nodes, ) +class TransRObjective(GraphLinkClassificationObjective): + def __init__( + self, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, + measure_scores=False, dilate_scores=1, ns_groups=None + ): + super().__init__( + "TransR", graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, ns_groups=ns_groups + ) + + def create_link_predictor(self): + self.link_predictor = TransRLinkPredictor( + input_dim=self.target_emb_size, rel_dim=30, + num_relations=self.target_embedder.num_classes + ).to(self.device) + # self.positive_label = 1 + self.negative_label = -1 + self.label_dtype = torch.long + + def compute_acc_loss(self, node_embs_, element_embs_, labels): + + num_examples = len(labels) // 2 + anchor = node_embs_[:num_examples, :] + positive = element_embs_[:num_examples, :] + negative = element_embs_[num_examples:, :] + labels_ = labels[:num_examples] + + loss, sim = self.link_predictor(anchor, positive, negative, labels_) + acc = compute_accuracy(sim, labels >= 0) + + return acc, loss + +class 
TargetLinkMapper(GraphLinkSampler): + def __init__(self, elements, nodes, emb_size=1, ns_groups=None): + super(TargetLinkMapper, self).__init__( + elements, nodes, compact_dst=False, dst_to_global=True, emb_size=emb_size, ns_groups=ns_groups + ) def init(self, compact_dst): if compact_dst: @@ -106,7 +107,7 @@ def init(self, compact_dst): self.elements['emb_id'] = self.elements['dst'] self.link_type2id, self.inverse_link_type_map = compact_property(self.elements['type'], return_order=True, index_from_one=True) - self.elements["link_type"] = self.elements["type"].apply(lambda x: self.link_type2id[x]) + self.elements["link_type"] = list(map(lambda x: self.link_type2id[x], self.elements["type"].tolist())) self.element_lookup = {} for id_, emb_id, link_type in self.elements[["id", "emb_id", "link_type"]].values: @@ -117,12 +118,22 @@ def init(self, compact_dst): self.init_neg_sample() self.num_classes = len(self.inverse_link_type_map) + self.null_class = 0 def __getitem__(self, ids): + self.cached_ids = ids node_ids, labels = zip(*(rnd.choice(self.element_lookup[id]) for id in ids)) - return np.array(node_ids, dtype=np.int32), np.array(labels, dtype=np.int32) + self.cached_labels = list(labels) + return np.array(node_ids, dtype=np.int32)#, torch.LongTensor(np.array(labels, dtype=np.int32)) + + def get_labels(self, ids): + if self.cached_ids == ids: + return self.cached_labels + else: + node_ids, labels = zip(*(rnd.choice(self.element_lookup[id]) for id in ids)) + return list(labels) - def sample_negative(self, size, ids=None, strategy="closest"): # TODO switch to w2v? + def sample_negative(self, size, ids=None, strategy="w2v"): # TODO switch to w2v? 
if strategy == "w2v": negative = ElementEmbedderBase.sample_negative(self, size) else: diff --git a/SourceCodeTools/models/graph/train/objectives/GraphLinkObjective.py b/SourceCodeTools/models/graph/train/objectives/GraphLinkObjective.py index 3aaaedde..00258c50 100644 --- a/SourceCodeTools/models/graph/train/objectives/GraphLinkObjective.py +++ b/SourceCodeTools/models/graph/train/objectives/GraphLinkObjective.py @@ -1,9 +1,6 @@ from collections import OrderedDict -import torch - -from SourceCodeTools.code.data.sourcetrail.SubwordMasker import SubwordMasker -from SourceCodeTools.models.graph.LinkPredictor import LinkClassifier +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker from SourceCodeTools.models.graph.train.objectives.AbstractObjective import AbstractObjective @@ -12,14 +9,18 @@ def __init__( self, name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1, nn_index="brute", ns_groups=None ): super(GraphLinkObjective, self).__init__( name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, nn_index=nn_index, + ns_groups=ns_groups ) + self.target_embedding_fn = self.get_targets_from_nodes + self.negative_factor = 1 + self.update_embeddings_for_queries = True def verify_parameters(self): pass @@ -27,34 +28,6 @@ def verify_parameters(self): def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): self.create_graph_link_sampler(data_loading_func, nodes) - def create_link_predictor(self): - if 
self.link_predictor_type == "nn": - self.create_nn_link_predictor() - elif self.link_predictor_type == "inner_prod": - self.create_inner_prod_link_predictor() - else: - raise NotImplementedError() - - def forward(self, input_nodes, seeds, blocks, train_embeddings=True): - masked = None - graph_emb = self._logits_batch(input_nodes, blocks, train_embeddings, masked=masked) - node_embs_, element_embs_, labels = self._logits_nodes( - graph_emb, self.target_embedder, self.link_predictor, - self._create_loader, seeds, train_embeddings=train_embeddings - ) - - acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) - - return loss, acc - - def evaluate(self, data_split, neg_sampling_factor=1): - loss, acc, ndcg = self._evaluate_nodes( - self.target_embedder, self.link_predictor, self._create_loader, data_split=data_split, - neg_sampling_factor=neg_sampling_factor - ) - # ndcg = None - return loss, acc, ndcg - def parameters(self, recurse: bool = True): return self.link_predictor.parameters() @@ -69,86 +42,37 @@ def custom_load_state_dict(self, state_dicts): self.get_prefix("link_predictor", state_dicts) ) - def _logits_nodes(self, node_embeddings, - elem_embedder, link_predictor, create_dataloader, - src_seeds, negative_factor=1, train_embeddings=True): - k = negative_factor - indices = self.seeds_to_global(src_seeds).tolist() - batch_size = len(indices) - - node_embeddings_batch = node_embeddings - next_call_indices = elem_embedder[indices] # this assumes indices is torch tensor - - # dst targets are not unique - unique_dst, slice_map = self._handle_non_unique(next_call_indices) - assert unique_dst[slice_map].tolist() == next_call_indices.tolist() - - dataloader = create_dataloader(unique_dst) - input_nodes, dst_seeds, blocks = next(iter(dataloader)) - blocks = [blk.to(self.device) for blk in blocks] - assert dst_seeds.shape == unique_dst.shape - assert dst_seeds.tolist() == unique_dst.tolist() - unique_dst_embeddings = self._logits_batch(input_nodes, 
blocks, train_embeddings) # use_types, ntypes) - next_call_embeddings = unique_dst_embeddings[slice_map.to(self.device)] - labels_pos = torch.full((batch_size,), self.positive_label, dtype=self.label_dtype) - - node_embeddings_neg_batch = node_embeddings_batch.repeat(k, 1) - # negative_indices = torch.tensor(elem_embedder.sample_negative( - # batch_size * k), dtype=torch.long) # embeddings are sampled from 3/4 unigram distribution - negative_indices = torch.tensor(elem_embedder.sample_negative( - batch_size * k, ids=indices), dtype=torch.long) # closest negative - unique_negative, slice_map = self._handle_non_unique(negative_indices) - assert unique_negative[slice_map].tolist() == negative_indices.tolist() - - self.target_embedder.set_embed(unique_dst.detach().cpu().numpy(), unique_dst_embeddings.detach().cpu().numpy()) - - dataloader = create_dataloader(unique_negative) - input_nodes, dst_seeds, blocks = next(iter(dataloader)) - blocks = [blk.to(self.device) for blk in blocks] - assert dst_seeds.shape == unique_negative.shape - assert dst_seeds.tolist() == unique_negative.tolist() - unique_negative_random = self._logits_batch(input_nodes, blocks, train_embeddings) # use_types, ntypes) - negative_random = unique_negative_random[slice_map.to(self.device)] - labels_neg = torch.full((batch_size * k,), self.negative_label, dtype=self.label_dtype) - - self.target_embedder.set_embed(unique_negative.detach().cpu().numpy(), unique_negative_random.detach().cpu().numpy()) - - nodes = torch.cat([node_embeddings_batch, node_embeddings_neg_batch], dim=0) - embs = torch.cat([next_call_embeddings, negative_random], dim=0) - labels = torch.cat([labels_pos, labels_neg], 0).to(self.device) - return nodes, embs, labels - - -class GraphLinkTypeObjective(GraphLinkObjective): - def __init__( - self, name, graph_model, node_embedder, nodes, data_loading_func, device, - sampling_neighbourhood_size, batch_size, - tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", 
masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 - ): - self.set_num_classes(data_loading_func) - - super(GraphLinkObjective, self).__init__( - name, graph_model, node_embedder, nodes, data_loading_func, device, - sampling_neighbourhood_size, batch_size, - tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg - ) - - def set_num_classes(self, data_loading_func): - pass - - def create_nn_link_type_predictor(self): - self.link_predictor = LinkClassifier(2 * self.graph_model.emb_size, self.num_classes).to(self.device) - self.positive_label = 1 - self.negative_label = 0 - self.label_dtype = torch.long - def create_link_predictor(self): - if self.link_predictor_type == "nn": - self.create_nn_link_predictor() - elif self.link_predictor_type == "inner_prod": - self.create_inner_prod_link_predictor() - else: - raise NotImplementedError() +# class GraphLinkTypeObjective(GraphLinkObjective): +# def __init__( +# self, name, graph_model, node_embedder, nodes, data_loading_func, device, +# sampling_neighbourhood_size, batch_size, +# tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, +# measure_scores=False, dilate_scores=1 +# ): +# self.set_num_classes(data_loading_func) +# +# super(GraphLinkObjective, self).__init__( +# name, graph_model, node_embedder, nodes, data_loading_func, device, +# sampling_neighbourhood_size, batch_size, +# tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, +# masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores +# ) +# +# def set_num_classes(self, data_loading_func): +# pass +# +# def create_nn_link_type_predictor(self): +# self.link_predictor = LinkClassifier(2 * self.graph_model.emb_size, self.num_classes).to(self.device) +# self.positive_label = 1 +# self.negative_label = 0 +# 
self.label_dtype = torch.long +# +# def create_link_predictor(self): +# if self.link_predictor_type == "nn": +# self.create_nn_link_predictor() +# elif self.link_predictor_type == "inner_prod": +# self.create_inner_prod_link_predictor() +# else: +# raise NotImplementedError() diff --git a/SourceCodeTools/models/graph/train/objectives/NodeClassificationObjective.py b/SourceCodeTools/models/graph/train/objectives/NodeClassificationObjective.py index cd568ec8..39979cdf 100644 --- a/SourceCodeTools/models/graph/train/objectives/NodeClassificationObjective.py +++ b/SourceCodeTools/models/graph/train/objectives/NodeClassificationObjective.py @@ -1,31 +1,33 @@ -from collections import OrderedDict +import logging +from collections import OrderedDict, defaultdict from itertools import chain import torch -from sklearn.metrics import ndcg_score +from sklearn.metrics import ndcg_score, top_k_accuracy_score from torch import nn from torch.nn import CrossEntropyLoss -from SourceCodeTools.code.data.sourcetrail import SubwordMasker -from SourceCodeTools.models.graph.train.objectives.AbstractObjective import AbstractObjective, _compute_accuracy +from SourceCodeTools.code.data.dataset import SubwordMasker +from SourceCodeTools.models.graph.train.objectives.AbstractObjective import AbstractObjective, compute_accuracy, \ + sum_scores from SourceCodeTools.models.graph.ElementEmbedderBase import ElementEmbedderBase from SourceCodeTools.models.graph.train.Scorer import Scorer import numpy as np -class NodeNameClassifier(AbstractObjective): +class NodeClassifierObjective(AbstractObjective): def __init__( - self, graph_model, node_embedder, nodes, data_loading_func, device, + self, name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type=None, masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1, early_stopping=False, 
early_stopping_tolerance=20 ): super().__init__( - "NodeNameClassifier", graph_model, node_embedder, nodes, data_loading_func, device, + name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, early_stopping=early_stopping, early_stopping_tolerance=early_stopping_tolerance ) def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): @@ -34,65 +36,98 @@ def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): ) def create_link_predictor(self): - self.classifier = NodeClassifier(self.target_emb_size, self.target_embedder.num_classes).to(self.device) + self.classifier = NodeClassifier( + self.target_emb_size, self.target_embedder.num_classes).to(self.device) - def compute_acc_loss(self, graph_emb, labels, return_logits=False): + def compute_acc_loss(self, graph_emb, element_emb, labels, return_logits=False): logits = self.classifier(graph_emb) loss_fct = CrossEntropyLoss(ignore_index=-100) loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1)) - acc = _compute_accuracy(logits.argmax(dim=1), labels) + acc = compute_accuracy(logits.argmax(dim=1), labels) if return_logits: return acc, loss, logits return acc, loss - def forward(self, input_nodes, seeds, blocks, train_embeddings=True): - masked = self.masker.get_mask(self.seeds_to_python(seeds)) if self.masker is not None else None - graph_emb = self._logits_batch(input_nodes, blocks, train_embeddings, masked=masked) + def prepare_for_prediction( + self, node_embeddings, seeds, target_embedding_fn, negative_factor=1, + neg_sampling_strategy=None, train_embeddings=True, + ): indices = self.seeds_to_global(seeds).tolist() - labels = 
torch.LongTensor(self.target_embedder[indices]).to(self.device) - acc, loss = self.compute_acc_loss(graph_emb, labels) + labels = torch.LongTensor( + self.target_embedder[indices]).to(self.device) - return loss, acc + return node_embeddings, None, labels - def evaluate_generation(self, data_split): - total_loss = 0 - total_acc = 0 - ndcg_at = [1, 3, 5, 10] - total_ndcg = {f"ndcg@{k}": 0. for k in ndcg_at} - ndcg_count = 0 + def evaluate_objective(self, data_split, neg_sampling_strategy=None, negative_factor=1): + at = [1, 3, 5, 10] count = 0 + scores = defaultdict(list) for input_nodes, seeds, blocks in getattr(self, f"{data_split}_loader"): blocks = [blk.to(self.device) for blk in blocks] - src_embs = self._logits_batch(input_nodes, blocks) - indices = self.seeds_to_global(seeds).tolist() - labels = self.target_embedder[indices] - labels = torch.LongTensor(labels).to(self.device) - acc, loss, logits = self.compute_acc_loss(src_embs, labels, return_logits=True) + if self.masker is None: + masked = None + else: + masked = self.masker.get_mask(self.seeds_to_python(seeds)) + + src_embs = self._graph_embeddings( + input_nodes, blocks, masked=masked) + node_embs_, element_embs_, labels = self.prepare_for_prediction( + src_embs, seeds, self.target_embedding_fn, negative_factor=negative_factor, + neg_sampling_strategy=neg_sampling_strategy, + train_embeddings=False + ) + # indices = self.seeds_to_global(seeds).tolist() + # labels = self.target_embedder[indices] + # labels = torch.LongTensor(labels).to(self.device) + acc, loss, logits = self.compute_acc_loss( + node_embs_, element_embs_, labels, return_logits=True) y_pred = nn.functional.softmax(logits, dim=-1).to("cpu").numpy() y_true = np.zeros(y_pred.shape) - y_true[np.arange(0, y_true.shape[0]), labels.to("cpu").numpy()] = 1. 
- - if self.measure_ndcg: - if count % self.dilate_ndcg == 0: - for k in ndcg_at: - total_ndcg[f"ndcg@{k}"] += ndcg_score(y_true, y_pred, k=k) - ndcg_count += 1 + y_true[np.arange(0, y_true.shape[0]), + labels.to("cpu").numpy()] = 1. + + if self.measure_scores: + if y_pred.shape[1] == 2: + logging.warning( + "Scores are meaningless for binary classification. Disabling.") + self.measure_scores = False + else: + if count % self.dilate_scores == 0: + y_true_onehot = np.array(y_true) + labels = list(range(y_true_onehot.shape[1])) + + for k in at: + # do not measure for binary classification + if k >= y_pred.shape[1]: + if not hasattr(self, f"meaning_scores_warning_{k}"): + logging.warning( + f"Disabling @{k} scores for task with {y_pred.shape[1]} classes") + setattr( + self, f"meaning_scores_warning_{k}", True) + continue # scores do not have much sense in this situation + scores[f"ndcg@{k}"].append( + ndcg_score(y_true, y_pred, k=k)) + scores[f"acc@{k}"].append( + top_k_accuracy_score( + y_true_onehot.argmax(-1), y_pred, k=k, labels=labels) + ) + + scores["Loss"].append(loss.item()) + scores["Accuracy"].append(acc) + count += 1 - total_loss += loss.item() - total_acc += acc + if count == 0: count += 1 - return total_loss / count, total_acc / count, {key: val / ndcg_count for key, val in total_ndcg.items()} if self.measure_ndcg else None - def evaluate(self, data_split, neg_sampling_factor=1): - loss, acc, bleu = self.evaluate_generation(data_split) - return loss, acc, bleu + scores = {key: sum_scores(val) for key, val in scores.items()} + return scores def parameters(self, recurse: bool = True): return chain(self.classifier.parameters()) @@ -100,10 +135,25 @@ def parameters(self, recurse: bool = True): def custom_state_dict(self): state_dict = OrderedDict() for k, v in self.classifier.state_dict().items(): - state_dict[f"target_embedder.{k}"] = v + state_dict[f"classifier.{k}"] = v return state_dict +class NodeNameClassifier(NodeClassifierObjective): + def __init__( + 
self, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type=None, masker: SubwordMasker = None, + measure_scores=False, dilate_scores=1, early_stopping=False, early_stopping_tolerance=20 + ): + super().__init__( + "NodeNameClassifier", graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, early_stopping=early_stopping, early_stopping_tolerance=early_stopping_tolerance + ) + + class ClassifierTargetMapper(ElementEmbedderBase, Scorer): def __init__(self, elements, nodes): ElementEmbedderBase.__init__(self, elements=elements, nodes=nodes,) @@ -112,14 +162,14 @@ def __init__(self, elements, nodes): def set_embed(self, *args, **kwargs): pass - def prepare_index(self): + def prepare_index(self, *args): pass class NodeClassifier(nn.Module): def __init__(self, input_dims, num_classes, hidden=100): super().__init__() - + print('id', input_dims, 'nc', num_classes) self.l1 = nn.Linear(input_dims, hidden) self.l1_a = nn.LeakyReLU() @@ -127,4 +177,4 @@ def __init__(self, input_dims, num_classes, hidden=100): def forward(self, x, **kwargs): x = self.l1_a(self.l1(x)) - return self.logits(x) \ No newline at end of file + return self.logits(x) diff --git a/SourceCodeTools/models/graph/train/objectives/SCAAClassifierObjetive.py b/SourceCodeTools/models/graph/train/objectives/SCAAClassifierObjetive.py new file mode 100644 index 00000000..c156cc52 --- /dev/null +++ b/SourceCodeTools/models/graph/train/objectives/SCAAClassifierObjetive.py @@ -0,0 +1,79 @@ +from collections import OrderedDict +from itertools import chain + +import torch +import torch.nn.functional as F + +from 
SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective import SubgraphMatchingObjective + + +class PoolingLayer(torch.nn.Module): + + def __init__(self, k, shape=1): + super().__init__() + self.learnable_vector = torch.nn.Parameter(torch.randn(shape)) + self.learnable_vector.requires_grad = True + self.k = k + + def forward(self, x): + length = torch.norm(self.learnable_vector) + y = torch.mm(x, self.learnable_vector)/length + if (y.shape[0] < self.k): + y = F.pad(input=y, pad=( + 0, 0, 0, self.k - y.shape[0]), mode='constant', value=0) + x = F.pad(input=x, pad=( + 0, 0, 0, self.k - x.shape[0]), mode='constant', value=0) + idx = torch.topk(y, self.k, dim=0) + y = torch.gather(y, 0, idx.indices) + y = torch.sigmoid(y) + x_part = torch.gather(x, 0, idx.indices) + return torch.mul(x_part, y) + + +class SCAAClassifierObjective(SubgraphMatchingObjective): + + def __init__( + self, name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", + masker=None, measure_scores=False, dilate_scores=1, + early_stopping=False, early_stopping_tolerance=20, nn_index="brute", + ns_groups=None, subgraph_mapping=None, subgraph_partition=None + ): + SubgraphMatchingObjective.__init__(self, + name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path, target_emb_size, link_predictor_type, + masker, measure_scores, dilate_scores, early_stopping, early_stopping_tolerance, nn_index, + ns_groups, subgraph_mapping, subgraph_partition + ) + self.pooler = PoolingLayer(100, (100, 1)) + # self.pooler = torch.nn.MultiheadAttention( + # 100, 4, device=device, batch_first=True) + + def parameters(self, recurse: bool = True): + return chain(self.link_predictor.parameters(), self.pooler.parameters()) + + def pooling_fn(self, node_embeddings): + return self.pooler(node_embeddings) + + # 
node_embeddings = torch.reshape(node_embeddings, (1, -1, 100)) + # pool = self.pooler(node_embeddings, node_embeddings, + # node_embeddings, need_weights=False)[0][0] + # return torch.transpose(torch.mean(pool, dim=0, keepdim=True), 0, 1) + + def custom_state_dict(self): + state_dict = OrderedDict() + for k, v in self.link_predictor.state_dict().items(): + state_dict[f"classifier.{k}"] = v + for k, v in self.pooler.state_dict().items(): + state_dict[f"pooler.{k}"] = v + return state_dict + + def custom_load_state_dict(self, state_dicts): + self.link_predictor.load_state_dict( + self.get_prefix("classifier", state_dicts) + ) + self.pooler.load_state_dict( + self.get_prefix("pooler", state_dicts) + ) diff --git a/SourceCodeTools/models/graph/train/objectives/SubgraphClassifierObjective.py b/SourceCodeTools/models/graph/train/objectives/SubgraphClassifierObjective.py new file mode 100644 index 00000000..20d2fbce --- /dev/null +++ b/SourceCodeTools/models/graph/train/objectives/SubgraphClassifierObjective.py @@ -0,0 +1,420 @@ +import logging +from collections import OrderedDict, defaultdict +from itertools import chain +from typing import Optional + +import numpy as np +import torch +from sklearn.metrics import ndcg_score, top_k_accuracy_score +from torch import nn +from tqdm import tqdm + +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker +from SourceCodeTools.code.data.file_utils import unpersist +from SourceCodeTools.models.graph.ElementEmbedder import ElementEmbedderWithBpeSubwords +from SourceCodeTools.models.graph.ElementEmbedderBase import ElementEmbedderBase +from SourceCodeTools.models.graph.train.Scorer import Scorer +from SourceCodeTools.models.graph.train.objectives.AbstractObjective import AbstractObjective, sum_scores +from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifier, \ + NodeClassifierObjective +from SourceCodeTools.tabular.common import compact_property + + +class 
SubgraphLoader: + def __init__(self, ids, subgraph_mapping, loading_fn, batch_size, graph_node_types): + self.ids = ids + self.loading_fn = loading_fn + self.subgraph_mapping = subgraph_mapping + self.iterator = None + self.batch_size = batch_size + self.graph_node_types = graph_node_types + + def load_ids(self, batch_ids): + node_ids = dict() + subgraphs = dict() + + for id_ in batch_ids: + subgraph_nodes = self._get_subgraph(id_) + subgraphs[id_] = subgraph_nodes + + for type_ in subgraph_nodes: + if type_ not in node_ids: + node_ids[type_] = set() + node_ids[type_].update(subgraph_nodes[type_]) + + for type_ in node_ids: + node_ids[type_] = sorted(list(node_ids[type_])) + + coincidence_matrix = [] + for id_, subgraph in subgraphs.items(): + coincidence_matrix.append([]) + for type_ in self.graph_node_types: + subgraph_nodes = subgraph[type_] + for node_id in node_ids[type_]: + coincidence_matrix[-1].append(node_id in subgraph_nodes) + + coincidence_matrix = torch.BoolTensor(coincidence_matrix) + + loader = self.loading_fn(node_ids) + + for input_nodes, seeds, blocks in loader: + # generator contains only one batch + return input_nodes, (coincidence_matrix, torch.LongTensor(batch_ids)), blocks + + def __iter__(self): + # TODO + # supports only nodes without types + + for i in range(0, len(self.ids), self.batch_size): + batch_ids = self.ids[i: i + self.batch_size] + yield self.load_ids(batch_ids) + + # for id_ in self.ids: + # idx = self._get_subgraph(id_) + # loader = self.loading_fn(idx) + # for input_nodes, seeds, blocks in loader: + # yield input_nodes, torch.LongTensor([id_]), blocks + + def _get_subgraph(self, id_): + if isinstance(id_, torch.Tensor): + id_ = id_.item() + return self.subgraph_mapping[id_] + + +class SubgraphAbstractObjective(AbstractObjective): + def __init__( + self, name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, 
link_predictor_type="inner_prod", + masker: Optional[SubwordMasker] = None, measure_scores=False, dilate_scores=1, + early_stopping=False, early_stopping_tolerance=20, nn_index="brute", + ns_groups=None, subgraph_mapping=None, subgraph_partition=None + ): + assert subgraph_partition is not None, "Provide train/val/test splits with `subgraph_partition` option" + self.subgraph_mapping = subgraph_mapping + self.subgraph_partition = unpersist(subgraph_partition) + super(SubgraphAbstractObjective, self).__init__( + name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path, target_emb_size, link_predictor_type, masker, + measure_scores, dilate_scores, early_stopping, early_stopping_tolerance, nn_index, + ns_groups + ) + + self.target_embedding_fn = self.get_targets_from_embedder + self.negative_factor = 1 + self.update_embeddings_for_queries = False + + # def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): + # self.target_embedder = SubgraphElementEmbedderWithSubwords( + # data_loading_func(), self.target_emb_size, tokenizer_path + # ) + + def _get_training_targets(self): + + if hasattr(self.graph_model.g, 'ntypes'): + self.ntypes = self.graph_model.g.ntypes + self.use_types = True + + if len(self.graph_model.g.ntypes) == 1: + self.use_types = False + else: + # not sure when this is called + raise NotImplementedError() + + train_idx = self.subgraph_partition[ + self.subgraph_partition["train_mask"] + ]["id"].to_numpy() + val_idx = self.subgraph_partition[ + self.subgraph_partition["val_mask"] + ]["id"].to_numpy() + test_idx = self.subgraph_partition[ + self.subgraph_partition["test_mask"] + ]["id"].to_numpy() + + return train_idx, val_idx, test_idx + + def create_loaders(self): + print("Number of nodes", self.graph_model.g.number_of_nodes()) + train_idx, val_idx, test_idx = self._get_training_targets() + train_idx, val_idx, test_idx = self.target_embedder.create_idx_pools( + 
train_idx=train_idx, val_idx=val_idx, test_idx=test_idx + ) + logging.info( + f"Pool sizes for {self.name}: train {self._idx_len(train_idx)}, " + f"val {self._idx_len(val_idx)}, " + f"test {self._idx_len(test_idx)}." + ) + loaders = self._get_loaders( + train_idx=train_idx, val_idx=val_idx, test_idx=test_idx, + batch_size=self.batch_size # batch_size_node_name + ) + self.train_loader, self.val_loader, self.test_loader = loaders + + # def get_num_nodes(ids): + # return sum(len(ids[key_]) for key_ in ids) // self.batch_size + 1 + + self.num_train_batches = len(train_idx) // self.batch_size + 1 + self.num_test_batches = len(test_idx) // self.batch_size + 1 + self.num_val_batches = len(val_idx) // self.batch_size + 1 + + def _get_loaders(self, train_idx, val_idx, test_idx, batch_size): + + # logging.info("Batch size is ignored for subgraphs") + + subgraph_mapping = self.subgraph_mapping + + train_loader = SubgraphLoader( + train_idx, subgraph_mapping, self._create_loader, batch_size, self.graph_model.g.ntypes) + val_loader = SubgraphLoader( + val_idx, subgraph_mapping, self._create_loader, batch_size, self.graph_model.g.ntypes) + test_loader = SubgraphLoader( + test_idx, subgraph_mapping, self._create_loader, batch_size, self.graph_model.g.ntypes) + + return train_loader, val_loader, test_loader + + def parameters(self, recurse: bool = True): + return chain(self.target_embedder.parameters(), self.link_predictor.parameters()) + + def custom_state_dict(self): + state_dict = OrderedDict() + for k, v in self.target_embedder.state_dict().items(): + state_dict[f"target_embedder.{k}"] = v + for k, v in self.link_predictor.state_dict().items(): + state_dict[f"link_predictor.{k}"] = v + return state_dict + + def custom_load_state_dict(self, state_dicts): + self.target_embedder.load_state_dict( + self.get_prefix("target_embedder", state_dicts) + ) + self.link_predictor.load_state_dict( + self.get_prefix("link_predictor", state_dicts) + ) + + def pooling_fn(self, 
node_embeddings): + # learnable vertor of weights + p = torch.nn.Parameter(torch.randn((100, 1))) + y = node_embeddings @ p / (torch.norm(p)) + + return torch.mean(y, dim=0, keepdim=True) + + def _graph_embeddings(self, input_nodes, blocks, train_embeddings=True, masked=None, subgraph_masks=None): + node_embs = super(SubgraphAbstractObjective, self)._graph_embeddings( + input_nodes, blocks, train_embeddings, masked + ) + + subgraph_embs = [] + for subgraph_mask in subgraph_masks: + subgraph_embs.append(self.pooling_fn(node_embs[subgraph_mask])) + return torch.transpose(torch.cat(subgraph_embs, 1), 0, 1) + + def forward(self, input_nodes, seeds, blocks, train_embeddings=True, neg_sampling_strategy=None): + subgraph_masks, seeds = seeds + masked = self.masker.get_mask(self.seeds_to_python( + seeds)) if self.masker is not None else None + graph_emb = self._graph_embeddings( + input_nodes, blocks, train_embeddings, masked=masked, subgraph_masks=subgraph_masks) + subgraph_embs_, element_embs_, labels = self.prepare_for_prediction( + graph_emb, seeds, self.target_embedding_fn, negative_factor=self.negative_factor, + neg_sampling_strategy=neg_sampling_strategy, + train_embeddings=train_embeddings + ) + acc, loss = self.compute_acc_loss( + subgraph_embs_, element_embs_, labels) + + return loss, acc + + def evaluate_objective(self, data_split, neg_sampling_strategy=None, negative_factor=1): + at = [1, 3, 5, 10] + count = 0 + + scores = defaultdict(list) + + for input_nodes, seeds, blocks in tqdm( + getattr(self, f"{data_split}_loader"), total=getattr(self, f"num_{data_split}_batches") + ): + blocks = [blk.to(self.device) for blk in blocks] + + subgraph_masks, seeds = seeds + + if self.masker is None: + masked = None + else: + masked = self.masker.get_mask(self.seeds_to_python(seeds)) + + src_embs = self._graph_embeddings( + input_nodes, blocks, masked=masked, subgraph_masks=subgraph_masks) + node_embs_, element_embs_, labels = self.prepare_for_prediction( + src_embs, seeds, 
self.target_embedding_fn, negative_factor=negative_factor, + neg_sampling_strategy=neg_sampling_strategy, + train_embeddings=False + ) + + if self.measure_scores: + if count % self.dilate_scores == 0: + scores_ = self.target_embedder.score_candidates(self.seeds_to_global(seeds), src_embs, + self.link_predictor, at=at, + type=self.link_predictor_type, device=self.device) + for key, val in scores_.items(): + scores[key].append(val) + + acc, loss = self.compute_acc_loss( + node_embs_, element_embs_, labels) + + scores["Loss"].append(loss.item()) + scores["Accuracy"].append(acc) + count += 1 + + scores = {key: sum_scores(val) for key, val in scores.items()} + return scores + + def verify_parameters(self): + pass + + +class SubgraphEmbeddingObjective(SubgraphAbstractObjective): + def __init__( + self, name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", + masker: Optional[SubwordMasker] = None, measure_scores=False, dilate_scores=1, + early_stopping=False, early_stopping_tolerance=20, nn_index="brute", + ns_groups=None, subgraph_mapping=None, subgraph_partition=None + ): + super(SubgraphEmbeddingObjective, self).__init__( + name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path, target_emb_size, link_predictor_type, + masker, measure_scores, dilate_scores, early_stopping, early_stopping_tolerance, nn_index, + ns_groups, subgraph_mapping, subgraph_partition + ) + + +class SubgraphClassifierObjective(NodeClassifierObjective, SubgraphAbstractObjective): + def __init__( + self, name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", + masker: Optional[SubwordMasker] = None, measure_scores=False, dilate_scores=1, + early_stopping=False, 
early_stopping_tolerance=20, nn_index="brute", + ns_groups=None, subgraph_mapping=None, subgraph_partition=None + ): + SubgraphAbstractObjective.__init__(self, + name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path, target_emb_size, link_predictor_type, + masker, measure_scores, dilate_scores, early_stopping, early_stopping_tolerance, nn_index, + ns_groups, subgraph_mapping, subgraph_partition + ) + + def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): + self.target_embedder = SubgraphClassifierTargetMapper( + elements=data_loading_func() + ) + + def evaluate_objective(self, data_split, neg_sampling_strategy=None, negative_factor=1): + at = [1, 3, 5, 10] + count = 0 + scores = defaultdict(list) + + for input_nodes, seeds, blocks in getattr(self, f"{data_split}_loader"): + blocks = [blk.to(self.device) for blk in blocks] + + subgraph_masks, seeds = seeds + + if self.masker is None: + masked = None + else: + masked = self.masker.get_mask(self.seeds_to_python(seeds)) + + src_embs = self._graph_embeddings( + input_nodes, blocks, masked=masked, subgraph_masks=subgraph_masks) + node_embs_, element_embs_, labels = self.prepare_for_prediction( + src_embs, seeds, self.target_embedding_fn, negative_factor=negative_factor, + neg_sampling_strategy=neg_sampling_strategy, + train_embeddings=False + ) + # indices = self.seeds_to_global(seeds).tolist() + # labels = self.target_embedder[indices] + # labels = torch.LongTensor(labels).to(self.device) + acc, loss, logits = self.compute_acc_loss( + node_embs_, element_embs_, labels, return_logits=True) + + y_pred = nn.functional.softmax(logits, dim=-1).to("cpu").numpy() + y_true = np.zeros(y_pred.shape) + y_true[np.arange(0, y_true.shape[0]), + labels.to("cpu").numpy()] = 1. 
+ + if self.measure_scores: + if count % self.dilate_scores == 0: + y_true_onehot = np.array(y_true) + labels = list(range(y_true_onehot.shape[1])) + + for k in at: + scores[f"ndcg@{k}"].append( + ndcg_score(y_true, y_pred, k=k)) + scores[f"acc@{k}"].append( + top_k_accuracy_score( + y_true_onehot.argmax(-1), y_pred, k=k, labels=labels) + ) + + scores["Loss"].append(loss.item()) + scores["Accuracy"].append(acc) + count += 1 + + if count == 0: + count += 1 + + scores = {key: sum_scores(val) for key, val in scores.items()} + return scores + + def parameters(self, recurse: bool = True): + return chain(self.classifier.parameters()) + + def custom_state_dict(self): + state_dict = OrderedDict() + for k, v in self.classifier.state_dict().items(): + state_dict[f"classifier.{k}"] = v + return state_dict + + +class SubgraphElementEmbedderBase(ElementEmbedderBase): + def __init__(self, elements, compact_dst=True): + # super(ElementEmbedderBase, self).__init__() + self.elements = elements.rename({"src": "id"}, axis=1) + self.init(compact_dst) + + def preprocess_element_data(self, *args, **kwargs): + pass + + def create_idx_pools(self, train_idx, val_idx, test_idx): + pool = set(self.elements["id"]) + train_pool, val_pool, test_pool = self._create_pools( + train_idx, val_idx, test_idx, pool) + return train_pool, val_pool, test_pool + + +# class SubgraphElementEmbedderWithSubwords(SubgraphElementEmbedderBase, ElementEmbedderWithBpeSubwords): +# def __init__(self, elements, emb_size, tokenizer_path, num_buckets=100000, max_len=10): +# self.tokenizer_path = tokenizer_path +# SubgraphElementEmbedderBase.__init__( +# self, elements=elements, compact_dst=False) +# nn.Module.__init__(self) +# Scorer.__init__(self, num_embs=len(self.elements["dst"].unique()), emb_size=emb_size, +# src2dst=self.element_lookup) + +# self.emb_size = emb_size +# self.init_subwords(elements, num_buckets=num_buckets, max_len=max_len) + + +class SubgraphClassifierTargetMapper(SubgraphElementEmbedderBase, 
Scorer): + def __init__(self, elements): + SubgraphElementEmbedderBase.__init__(self, elements=elements) + self.num_classes = len(self.inverse_dst_map) + + def set_embed(self, *args, **kwargs): + pass + + def prepare_index(self, *args): + pass diff --git a/SourceCodeTools/models/graph/train/objectives/SubgraphEmbedderObjective.py b/SourceCodeTools/models/graph/train/objectives/SubgraphEmbedderObjective.py new file mode 100644 index 00000000..cb7e5f7d --- /dev/null +++ b/SourceCodeTools/models/graph/train/objectives/SubgraphEmbedderObjective.py @@ -0,0 +1,157 @@ +from collections import OrderedDict +from itertools import chain +from typing import Optional + +import numpy as np +from torch import nn + +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker +from SourceCodeTools.models.graph.ElementEmbedder import ElementEmbedderWithBpeSubwords +from SourceCodeTools.models.graph.train.Scorer import Scorer +from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \ + SubgraphElementEmbedderBase +import random as rnd + + +class SubgraphEmbeddingObjective(SubgraphAbstractObjective): + def __init__( + self, name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", + masker: Optional[SubwordMasker] = None, measure_scores=False, dilate_scores=1, + early_stopping=False, early_stopping_tolerance=20, nn_index="brute", + ns_groups=None, subgraph_mapping=None, subgraph_partition=None + ): + super(SubgraphEmbeddingObjective, self).__init__( + name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path, target_emb_size, link_predictor_type, + masker, measure_scores, dilate_scores, early_stopping, early_stopping_tolerance, nn_index, + ns_groups, subgraph_mapping, subgraph_partition + ) + + def 
create_target_embedder(self, data_loading_func, nodes, tokenizer_path): + self.target_embedder = SubgraphElementEmbedderWithSubwords( + data_loading_func(), self.target_emb_size, tokenizer_path + ) + + +class SubgraphMatchingObjective(SubgraphAbstractObjective): + def __init__( + self, name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", + masker: Optional[SubwordMasker] = None, measure_scores=False, dilate_scores=1, + early_stopping=False, early_stopping_tolerance=20, nn_index="brute", + ns_groups=None, subgraph_mapping=None, subgraph_partition=None + ): + super(SubgraphMatchingObjective, self).__init__( + name, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path, target_emb_size, link_predictor_type, + masker, measure_scores, dilate_scores, early_stopping, early_stopping_tolerance, nn_index, + ns_groups, subgraph_mapping, subgraph_partition + ) + + self.target_embedding_fn = self.get_targets_from_subgraph + self.update_embeddings_for_queries = True + + def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): + self.target_embedder = SubgraphElementEmbedderMatcher( + data_loading_func(), self.target_emb_size, tokenizer_path + ) + + def get_targets_from_subgraph( + self, positive_indices, negative_indices=None, train_embeddings=True + ): + def get_embeddings(loader): + input_nodes, seeds, blocks = loader # next(loader) + subgraph_masks, seeds = seeds + masked = self.masker.get_mask(self.seeds_to_python(seeds)) if self.masker is not None else None + graph_emb = self._graph_embeddings(input_nodes, blocks, train_embeddings, masked=masked, + subgraph_masks=subgraph_masks) + return graph_emb + + def compactify(indices): + return list(set(indices)) + + def restore(indices, compact, embs): + mapping = {i: compact.index(i) for i in indices} + fancy = 
[mapping[i] for i in indices] + return embs[fancy] + + def prepare(indices): + indices_c = compactify(indices) + embs = get_embeddings(self.train_loader.load_ids(indices_c)) + if len(indices_c) != len(indices): + embs = restore(indices, indices_c, embs) + return embs + + positive_dst = prepare(positive_indices) + negative_dst = prepare(negative_indices) if negative_indices is not None else None + + return positive_dst, negative_dst + + def sample_negative(self, ids, k, neg_sampling_strategy): + negative = self.target_embedder.sample_negative( + k, ids=ids, + ) + return negative + + def parameters(self, recurse: bool = True): + return chain(self.link_predictor.parameters()) + + def custom_state_dict(self): + state_dict = OrderedDict() + for k, v in self.link_predictor.state_dict().items(): + state_dict[f"link_predictor.{k}"] = v + return state_dict + + +class SubgraphElementEmbedderWithSubwords(SubgraphElementEmbedderBase, ElementEmbedderWithBpeSubwords): + def __init__(self, elements, emb_size, tokenizer_path, num_buckets=100000, max_len=10): + self.tokenizer_path = tokenizer_path + SubgraphElementEmbedderBase.__init__(self, elements=elements, compact_dst=False) + nn.Module.__init__(self) + Scorer.__init__(self, num_embs=len(self.elements["dst"].unique()), emb_size=emb_size, + src2dst=self.element_lookup) + + self.emb_size = emb_size + self.init_subwords(elements, num_buckets=num_buckets, max_len=max_len) + + +class SubgraphElementEmbedderMatcher(SubgraphElementEmbedderBase, Scorer): + def __init__(self, elements, emb_size, tokenizer_path): + self.tokenizer_path = tokenizer_path + SubgraphElementEmbedderBase.__init__(self, elements=elements, compact_dst=False) + nn.Module.__init__(self) + Scorer.__init__(self, num_embs=len(self.elements["id"].unique()), emb_size=emb_size, + src2dst=self.element_lookup) + + def init(self, compact_dst): + self.id2name = dict(zip(self.elements["id"], self.elements["dst"])) + self.all_names = set(self.id2name.values()) + assert 
len(self.id2name) == len(self.elements) + + self.name2id = dict() + for id_, name in self.elements.values: + if name not in self.name2id: + self.name2id[name] = [] + self.name2id[name].append(id_) + + self.element_lookup = dict() + for id_ in self.id2name: + self.element_lookup[id_] = list(set(self.name2id[self.id2name[id_]]) - {id_}) + + self.elements["emb_id"] = self.elements["id"] + + def __getitem__(self, ids): + return np.fromiter((rnd.choice(list(set(self.name2id[self.id2name[id]]) - {id})) for id in ids), dtype=np.int32) + + def sample_negative(self, size, ids=None): + negative = [] + for id in ids: + rnd_name = rnd.choice(list(self.all_names - {self.id2name[id]})) + rnd_id = rnd.choice(list(self.name2id[rnd_name])) + negative.append(rnd_id) + return np.array(negative, dtype=np.int32) diff --git a/SourceCodeTools/models/graph/train/objectives/SubwordEmbedderObjective.py b/SourceCodeTools/models/graph/train/objectives/SubwordEmbedderObjective.py index ff83d6e0..86cc3782 100644 --- a/SourceCodeTools/models/graph/train/objectives/SubwordEmbedderObjective.py +++ b/SourceCodeTools/models/graph/train/objectives/SubwordEmbedderObjective.py @@ -1,7 +1,7 @@ from collections import OrderedDict from itertools import chain -from SourceCodeTools.code.data.sourcetrail.SubwordMasker import SubwordMasker +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker from SourceCodeTools.models.graph.train.objectives.AbstractObjective import AbstractObjective @@ -10,14 +10,17 @@ def __init__( self, name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1, nn_index="brute" ): super(SubwordEmbedderObjective, self).__init__( name, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, 
tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, nn_index=nn_index ) + self.target_embedding_fn = self.get_targets_from_embedder + self.negative_factor = 1 + self.update_embeddings_for_queries = False def verify_parameters(self): if self.link_predictor_type == "inner_prod": @@ -26,34 +29,6 @@ def verify_parameters(self): def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): self.create_subword_embedder(data_loading_func, nodes, tokenizer_path) - def create_link_predictor(self): - if self.link_predictor_type == "nn": - self.create_nn_link_predictor() - elif self.link_predictor_type == "inner_prod": - self.create_inner_prod_link_predictor() - else: - raise NotImplementedError() - - def forward(self, input_nodes, seeds, blocks, train_embeddings=True): - masked = self.masker.get_mask(self.seeds_to_python(seeds)) if self.masker is not None else None - graph_emb = self._logits_batch(input_nodes, blocks, train_embeddings, masked=masked) - node_embs_, element_embs_, labels = self._logits_embedder( - graph_emb, self.target_embedder, self.link_predictor, seeds - ) - acc, loss = self.compute_acc_loss(node_embs_, element_embs_, labels) - - return loss, acc - - # def train(self): - # pass - - def evaluate(self, data_split, neg_sampling_factor=1): - loss, acc, ndcg = self._evaluate_embedder( - self.target_embedder, self.link_predictor, data_split=data_split, - neg_sampling_factor=neg_sampling_factor - ) - return loss, acc, ndcg - def parameters(self, recurse: bool = True): return chain(self.target_embedder.parameters(), self.link_predictor.parameters()) diff --git a/SourceCodeTools/models/graph/train/objectives/TextPredictionObjective.py b/SourceCodeTools/models/graph/train/objectives/TextPredictionObjective.py index 56ff5d8a..de941d73 100644 --- 
a/SourceCodeTools/models/graph/train/objectives/TextPredictionObjective.py +++ b/SourceCodeTools/models/graph/train/objectives/TextPredictionObjective.py @@ -1,19 +1,18 @@ -from collections import OrderedDict +from collections import OrderedDict, defaultdict from itertools import chain import datasets import torch -from torch import nn -from torch.nn import CrossEntropyLoss, NLLLoss +from torch.nn import CrossEntropyLoss -from SourceCodeTools.code.data.sourcetrail import SubwordMasker +from SourceCodeTools.code.data.dataset import SubwordMasker from SourceCodeTools.models.graph.ElementEmbedder import DocstringEmbedder, create_fixed_length, \ ElementEmbedderWithBpeSubwords from SourceCodeTools.models.graph.train.objectives import SubwordEmbedderObjective -from SourceCodeTools.models.graph.train.objectives.AbstractObjective import AbstractObjective, _compute_accuracy -from SourceCodeTools.models.nlp.Decoder import LSTMDecoder, Decoder +from SourceCodeTools.models.graph.train.objectives.AbstractObjective import compute_accuracy, \ + sum_scores +from SourceCodeTools.models.nlp.TorchDecoder import Decoder from SourceCodeTools.models.nlp.Vocabulary import Vocabulary -from SourceCodeTools.nlp.embed.bpe import load_bpe_model import numpy as np @@ -22,13 +21,13 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1 ): super().__init__( "GraphTextPrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores ) def 
create_target_embedder(self, data_loading_func, nodes, tokenizer_path): @@ -43,14 +42,14 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1, max_len=20 + measure_scores=False, dilate_scores=1, max_len=20 ): self.max_len = max_len + 2 # add pad and eos super().__init__( "GraphTextGeneration", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores ) def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): @@ -85,17 +84,21 @@ def compute_acc_loss(self, graph_emb, labels, lengths, return_logits=False): max_len = logits.shape[1] length_mask = torch.arange(max_len).to(self.device).expand(len(lengths), max_len) < lengths.unsqueeze(1) + logits_unrolled = logits[length_mask, :] + labels_unrolled = labels[length_mask] + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(logits_unrolled, labels_unrolled) # mask_ = length_mask.reshape(-1,) # loss_fct = NLLLoss(reduction="sum") - loss = loss_fct(logits.reshape(-1, logits.size(-1)),#[mask_, :], - labels.reshape(-1)) #[mask_]) + # loss = loss_fct(logits.reshape(-1, logits.size(-1)),#[mask_, :], + # labels.reshape(-1)) #[mask_]) def masked_accuracy(pred, true, mask): mask = mask.reshape(-1,) pred = pred.reshape(-1,)[mask] true = true.reshape(-1,)[mask] - return _compute_accuracy(pred, true) + return compute_accuracy(pred, true) acc = masked_accuracy(logits.argmax(dim=2), labels, length_mask) @@ -103,9 +106,8 @@ def masked_accuracy(pred, true, mask): return acc, loss, logits return acc, loss - 
- def forward(self, input_nodes, seeds, blocks, train_embeddings=True): - graph_emb = self._logits_batch(input_nodes, blocks, train_embeddings) + def forward(self, input_nodes, seeds, blocks, train_embeddings=True, neg_sampling_strategy=None): + graph_emb = self._graph_embeddings(input_nodes, blocks, train_embeddings) indices = self.seeds_to_global(seeds).tolist() labels, lengths = self.target_embedder[indices] labels = labels.to(self.device) @@ -129,16 +131,13 @@ def get_generated(self, tokens): return sents def evaluate_generation(self, data_split): - total_loss = 0 - total_acc = 0 - total_bleu = {f"bleu": 0.} - bleu_count = 0 + scores = defaultdict(list) count = 0 for input_nodes, seeds, blocks in getattr(self, f"{data_split}_loader"): blocks = [blk.to(self.device) for blk in blocks] - src_embs = self._logits_batch(input_nodes, blocks) + src_embs = self._graph_embeddings(input_nodes, blocks) indices = self.seeds_to_global(seeds).tolist() labels, lengths = self.target_embedder[indices] labels = labels.to(self.device) @@ -154,16 +153,19 @@ def evaluate_generation(self, data_split): # bleu = sacrebleu.corpus_bleu(pred, true) bleu = self.bleu_metric.compute(predictions=pred, references=[[t] for t in true]) - bleu_count += 1 - total_bleu["bleu"] += bleu['score'] + scores["bleu"].append(bleu['score']) - total_loss += loss.item() - total_acc += acc + scores["Loss"].append(loss.item()) + scores["Accuracy"].append(acc) count += 1 - return total_loss / count, total_acc / count, {"bleu": total_bleu["bleu"] / bleu_count} - def evaluate(self, data_split, neg_sampling_factor=1): + scores = {key: sum_scores(val) for key, val in scores.items()} + return scores + + def evaluate(self, data_split, *, neg_sampling_strategy=None, early_stopping=False, early_stopping_tolerance=20): loss, acc, bleu = self.evaluate_generation(data_split) + if data_split == "val": + self.check_early_stopping(acc) return loss, acc, bleu def parameters(self, recurse: bool = True): diff --git 
a/SourceCodeTools/models/graph/train/objectives/__init__.py b/SourceCodeTools/models/graph/train/objectives/__init__.py index 57b55612..9721cfa8 100644 --- a/SourceCodeTools/models/graph/train/objectives/__init__.py +++ b/SourceCodeTools/models/graph/train/objectives/__init__.py @@ -1,6 +1,8 @@ -from SourceCodeTools.code.data.sourcetrail.SubwordMasker import SubwordMasker -from SourceCodeTools.models.graph.train.objectives.GraphLinkObjective import GraphLinkObjective, GraphLinkTypeObjective -from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeNameClassifier +from SourceCodeTools.code.data.dataset.SubwordMasker import SubwordMasker +from SourceCodeTools.models.graph.train.objectives.GraphLinkClassificationObjective import \ + GraphLinkClassificationObjective +from SourceCodeTools.models.graph.train.objectives.GraphLinkObjective import GraphLinkObjective +from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeNameClassifier, NodeClassifierObjective from SourceCodeTools.models.graph.train.objectives.SubwordEmbedderObjective import SubwordEmbedderObjective from SourceCodeTools.models.graph.train.objectives.TextPredictionObjective import GraphTextPrediction, GraphTextGeneration @@ -10,13 +12,13 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1 ): super(TokenNamePrediction, self).__init__( "TokenNamePrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, 
dilate_scores=dilate_scores ) @@ -25,13 +27,29 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1, nn_index="brute" ): super(NodeNamePrediction, self).__init__( "NodeNamePrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, nn_index=nn_index + ) + + +class TypeAnnPrediction(NodeClassifierObjective): + def __init__( + self, graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type=None, masker: SubwordMasker = None, + measure_scores=False, dilate_scores=1 + ): + super(TypeAnnPrediction, self).__init__( + "TypeAnnPrediction", graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + # tokenizer_path=tokenizer_path, + target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, + masker=masker, dilate_scores=dilate_scores ) @@ -40,13 +58,13 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1 ): super(VariableNameUsePrediction, self).__init__( "VariableNamePrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, 
tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores ) @@ -55,13 +73,13 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1 ): super(NextCallPrediction, self).__init__( "NextCallPrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores ) @@ -70,26 +88,68 @@ def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1 ): super(GlobalLinkPrediction, self).__init__( "GlobalLinkPrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores ) -class LinkTypePrediction(GraphLinkTypeObjective): +# class LinkTypePrediction(GraphLinkTypeObjective): +# def __init__( +# self, graph_model, node_embedder, nodes, data_loading_func, device, +# sampling_neighbourhood_size, batch_size, +# 
tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, +# measure_scores=False, dilate_scores=1 +# ): +# super(GraphLinkTypeObjective, self).__init__( +# "LinkTypePrediction", graph_model, node_embedder, nodes, data_loading_func, device, +# sampling_neighbourhood_size, batch_size, +# tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, +# masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores +# ) + + +class EdgePrediction(GraphLinkObjective): def __init__( self, graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, - measure_ndcg=False, dilate_ndcg=1 + measure_scores=False, dilate_scores=1, nn_index="brute", ns_groups=None ): - super(GraphLinkTypeObjective, self).__init__( - "LinkTypePrediction", graph_model, node_embedder, nodes, data_loading_func, device, + super(EdgePrediction, self).__init__( + "EdgePrediction", graph_model, node_embedder, nodes, data_loading_func, device, sampling_neighbourhood_size, batch_size, tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, - masker=masker, measure_ndcg=measure_ndcg, dilate_ndcg=dilate_ndcg + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores, nn_index=nn_index, + ns_groups=ns_groups ) + # super(EdgePrediction, self).__init__( + # "LinkTypePrediction", graph_model, node_embedder, nodes, data_loading_func, device, + # sampling_neighbourhood_size, batch_size, + # tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, + # masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores + # ) + +class EdgePrediction2(SubwordEmbedderObjective): + def __init__( + self, graph_model, node_embedder, nodes, 
data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=None, target_emb_size=None, link_predictor_type="inner_prod", masker: SubwordMasker = None, + measure_scores=False, dilate_scores=1 + ): + super(EdgePrediction2, self).__init__( + "EdgePrediction2", graph_model, node_embedder, nodes, data_loading_func, device, + sampling_neighbourhood_size, batch_size, + tokenizer_path=tokenizer_path, target_emb_size=target_emb_size, link_predictor_type=link_predictor_type, + masker=masker, measure_scores=measure_scores, dilate_scores=dilate_scores + ) + + def create_target_embedder(self, data_loading_func, nodes, tokenizer_path): + from SourceCodeTools.models.graph.ElementEmbedder import ElementEmbedder + self.target_embedder = ElementEmbedder( + elements=data_loading_func(), nodes=nodes, emb_size=self.target_emb_size, + ).to(self.device) \ No newline at end of file diff --git a/SourceCodeTools/models/graph/train/sampling_multitask2.py b/SourceCodeTools/models/graph/train/sampling_multitask2.py index 46c9d873..62fcd80c 100644 --- a/SourceCodeTools/models/graph/train/sampling_multitask2.py +++ b/SourceCodeTools/models/graph/train/sampling_multitask2.py @@ -1,7 +1,10 @@ import os +from collections import defaultdict from copy import copy +from pprint import pprint from typing import Tuple +import numpy as np import torch import torch.nn as nn from torch.utils.tensorboard import SummaryWriter @@ -10,68 +13,105 @@ from os.path import join import logging +from tqdm import tqdm + from SourceCodeTools.models.Embedder import Embedder from SourceCodeTools.models.graph.train.objectives import VariableNameUsePrediction, TokenNamePrediction, \ NextCallPrediction, NodeNamePrediction, GlobalLinkPrediction, GraphTextPrediction, GraphTextGeneration, \ - NodeNameClassifier + NodeNameClassifier, EdgePrediction, TypeAnnPrediction, EdgePrediction2, NodeClassifierObjective from SourceCodeTools.models.graph.NodeEmbedder import NodeEmbedder +from 
SourceCodeTools.models.graph.train.objectives.GraphLinkClassificationObjective import TransRObjective +from SourceCodeTools.models.graph.train.objectives.SCAAClassifierObjetive import SCAAClassifierObjective +from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphClassifierObjective +from SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective import SubgraphEmbeddingObjective, \ + SubgraphMatchingObjective + +class EarlyStopping(Exception): + def __init__(self, *args, **kwargs): + super(EarlyStopping, self).__init__(*args, **kwargs) -def _compute_accuracy(pred_, true_): - return torch.sum(pred_ == true_).item() / len(true_) + +def add_to_summary(summary, partition, objective_name, scores, postfix): + summary.update({ + f"{key}/{partition}/{objective_name}_{postfix}": val for key, val in scores.items() + }) class SamplingMultitaskTrainer: - def __init__(self, - dataset=None, model_name=None, model_params=None, - trainer_params=None, restore=None, device=None, - pretrained_embeddings_path=None, - tokenizer_path=None - ): + def __init__( + self, dataset=None, model_name=None, model_params=None, trainer_params=None, restore=None, device=None, + pretrained_embeddings_path=None, tokenizer_path=None, load_external_dataset=None + ): self.graph_model = model_name(dataset.g, **model_params).to(device) self.model_params = model_params self.trainer_params = trainer_params self.device = device self.epoch = 0 + self.restore_epoch = 0 self.batch = 0 self.dtype = torch.float32 + if load_external_dataset is not None: + logging.info("Loading external dataset") + external_args, external_dataset = load_external_dataset() + self.graph_model.g = external_dataset.g + dataset = external_dataset self.create_node_embedder( dataset, tokenizer_path, n_dims=model_params["h_dim"], - pretrained_path=pretrained_embeddings_path, n_buckets=trainer_params["embedding_table_size"] + pretrained_path=pretrained_embeddings_path, 
n_buckets=trainer_params[ + "embedding_table_size"] ) - self.summary_writer = SummaryWriter(self.model_base_path) - self.create_objectives(dataset, tokenizer_path) if restore: self.restore_from_checkpoint(self.model_base_path) - self.optimizer = self._create_optimizer() + if load_external_dataset is not None: + self.trainer_params["model_base_path"] = external_args.external_model_base + + self._create_optimizer() self.lr_scheduler = ExponentialLR(self.optimizer, gamma=1.0) # self.lr_scheduler = ReduceLROnPlateau(self.optimizer, patience=10, cooldown=20) + self.summary_writer = SummaryWriter(self.model_base_path) def create_objectives(self, dataset, tokenizer_path): + objective_list = self.trainer_params["objectives"] + self.objectives = nn.ModuleList() - if "token_pred" in self.trainer_params["objectives"]: + if "token_pred" in objective_list: self.create_token_pred_objective(dataset, tokenizer_path) - if "node_name_pred" in self.trainer_params["objectives"]: + if "node_name_pred" in objective_list: self.create_node_name_objective(dataset, tokenizer_path) - if "var_use_pred" in self.trainer_params["objectives"]: + if "var_use_pred" in objective_list: self.create_var_use_objective(dataset, tokenizer_path) - if "next_call_pred" in self.trainer_params["objectives"]: + if "next_call_pred" in objective_list: self.create_api_call_objective(dataset, tokenizer_path) - if "global_link_pred" in self.trainer_params["objectives"]: + if "global_link_pred" in objective_list: self.create_global_link_objective(dataset, tokenizer_path) - if "doc_pred" in self.trainer_params["objectives"]: + if "edge_pred" in objective_list: + self.create_edge_objective(dataset, tokenizer_path) + if "transr" in objective_list: + self.create_transr_objective(dataset, tokenizer_path) + if "doc_pred" in objective_list: self.create_text_prediction_objective(dataset, tokenizer_path) - if "doc_gen" in self.trainer_params["objectives"]: + if "doc_gen" in objective_list: 
self.create_text_generation_objective(dataset, tokenizer_path) - if "node_clf" in self.trainer_params["objectives"]: - self.create_node_name_classifier_objective(dataset, tokenizer_path) + if "node_clf" in objective_list: + self.create_node_classifier_objective(dataset, tokenizer_path) + if "type_ann_pred" in objective_list: + self.create_type_ann_objective(dataset, tokenizer_path) + if "subgraph_name_clf" in objective_list: + self.create_subgraph_name_objective(dataset, tokenizer_path) + if "subgraph_clf" in objective_list: + self.create_subgraph_classifier_objective(dataset, tokenizer_path) + if "subgraph_match" in objective_list: + self.create_subgraph_matching_objective(dataset, tokenizer_path) + if "scaa_match" in objective_list: + self.create_scaa_matching_objective(dataset, tokenizer_path) def create_token_pred_objective(self, dataset, tokenizer_path): self.objectives.append( @@ -80,21 +120,108 @@ def create_token_pred_objective(self, dataset, tokenizer_path): dataset.load_token_prediction, self.device, self.sampling_neighbourhood_size, self.batch_size, tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", - masker=dataset.create_subword_masker(), measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + masker=dataset.create_subword_masker(), measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] ) ) def create_node_name_objective(self, dataset, tokenizer_path): self.objectives.append( + # GraphTextGeneration( + # self.graph_model, self.node_embedder, dataset.nodes, + # dataset.load_node_names, self.device, + # self.sampling_neighbourhood_size, self.batch_size, + # tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, + # ) NodeNamePrediction( self.graph_model, self.node_embedder, dataset.nodes, dataset.load_node_names, self.device, self.sampling_neighbourhood_size, self.batch_size, 
tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", masker=dataset.create_node_name_masker(tokenizer_path), - measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"], nn_index=self.trainer_params["nn_index"] + ) + ) + + def _create_subgraph_objective( + self, *, objective_name, objective_class, dataset, labels_fn, tokenizer_path, subgraph_mapping=None, + subgraph_partition=None, + masker=None + ): + if subgraph_mapping is None: + subgraph_mapping = dataset.subgraph_mapping + + if subgraph_partition is None: + subgraph_partition = dataset.subgraph_partition + + return objective_class( + objective_name, + self.graph_model, self.node_embedder, dataset.nodes, + labels_fn, self.device, + self.sampling_neighbourhood_size, self.batch_size, + tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", + masker=masker, # dataset.create_node_name_masker(tokenizer_path), + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"], nn_index=self.trainer_params["nn_index"], + subgraph_mapping=subgraph_mapping, + subgraph_partition=subgraph_partition + ) + + def create_subgraph_name_objective(self, dataset, tokenizer_path): + self.objectives.append( + self._create_subgraph_objective( + objective_name="SubgraphNameEmbeddingObjective", + objective_class=SubgraphEmbeddingObjective, + dataset=dataset, + tokenizer_path=tokenizer_path, + labels_fn=dataset.load_subgraph_function_names, + ) + ) + + def create_subgraph_classifier_objective(self, dataset, tokenizer_path): + self.objectives.append( + self._create_subgraph_objective( + objective_name="SubgraphClassifierObjective", + objective_class=SubgraphClassifierObjective, + dataset=dataset, + tokenizer_path=tokenizer_path, + 
labels_fn=dataset.load_cubert_subgraph_labels, + ) + ) + + def create_subgraph_matching_objective(self, dataset, tokenizer_path): + self.objectives.append( + self._create_subgraph_objective( + objective_name="SubgraphMatchingObjective", + objective_class=SubgraphMatchingObjective, + dataset=dataset, + tokenizer_path=tokenizer_path, + labels_fn=dataset.load_scaa_subgraph_labels, + ) + ) + + def create_scaa_matching_objective(self, dataset, tokenizer_path): + self.objectives.append( + self._create_subgraph_objective( + objective_name="SCAAMatching", + objective_class=SCAAClassifierObjective, + dataset=dataset, + tokenizer_path=tokenizer_path, + labels_fn=dataset.load_scaa_subgraph_labels, + ) + ) + + def create_type_ann_objective(self, dataset, tokenizer_path): + self.objectives.append( + TypeAnnPrediction( + self.graph_model, self.node_embedder, dataset.nodes, + dataset.load_type_prediction, self.device, + self.sampling_neighbourhood_size, self.batch_size, + tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", + masker=None, + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] ) ) @@ -106,8 +233,8 @@ def create_node_name_classifier_objective(self, dataset, tokenizer_path): self.sampling_neighbourhood_size, self.batch_size, tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, masker=dataset.create_node_name_masker(tokenizer_path), - measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] ) ) @@ -119,8 +246,8 @@ def create_var_use_objective(self, dataset, tokenizer_path): self.sampling_neighbourhood_size, self.batch_size, tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", masker=dataset.create_variable_name_masker(tokenizer_path), - 
measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] ) ) @@ -131,8 +258,8 @@ def create_api_call_objective(self, dataset, tokenizer_path): dataset.load_api_call, self.device, self.sampling_neighbourhood_size, self.batch_size, tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", - measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] ) ) @@ -145,8 +272,36 @@ def create_global_link_objective(self, dataset, tokenizer_path): dataset.load_global_edges_prediction, self.device, self.sampling_neighbourhood_size, self.batch_size, tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="nn", - measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] + ) + ) + + def create_edge_objective(self, dataset, tokenizer_path): + self.objectives.append( + EdgePrediction( + self.graph_model, self.node_embedder, dataset.nodes, + dataset.load_edge_prediction, self.device, + self.sampling_neighbourhood_size, self.batch_size, + tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type=self.trainer_params[ + "metric"], + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"], nn_index=self.trainer_params["nn_index"], + # ns_groups=dataset.get_negative_sample_groups() + ) + ) + + def create_transr_objective(self, dataset, tokenizer_path): + self.objectives.append( + TransRObjective( + self.graph_model, self.node_embedder, dataset.nodes, + dataset.load_edge_prediction, self.device, + 
self.sampling_neighbourhood_size, self.batch_size, + tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, + link_predictor_type=self.trainer_params["metric"], + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"], + # ns_groups=dataset.get_negative_sample_groups() ) ) @@ -157,8 +312,8 @@ def create_text_prediction_objective(self, dataset, tokenizer_path): dataset.load_docstring, self.device, self.sampling_neighbourhood_size, self.batch_size, tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, link_predictor_type="inner_prod", - measure_ndcg=self.trainer_params["measure_ndcg"], - dilate_ndcg=self.trainer_params["dilate_ndcg"] + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"] ) ) @@ -172,6 +327,22 @@ def create_text_generation_objective(self, dataset, tokenizer_path): ) ) + def create_node_classifier_objective(self, dataset, tokenizer_path): + self.objectives.append( + NodeClassifierObjective( + "NodeTypeClassifier", + self.graph_model, self.node_embedder, dataset.nodes, + dataset.load_node_classes, self.device, + self.sampling_neighbourhood_size, self.batch_size, + tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, + masker=dataset.create_node_clf_masker(), + measure_scores=self.trainer_params["measure_scores"], + dilate_scores=self.trainer_params["dilate_scores"], + early_stopping=self.trainer_params["early_stopping"], + early_stopping_tolerance=self.trainer_params["early_stopping_tolerance"] + ) + ) + def create_node_embedder(self, dataset, tokenizer_path, n_dims=None, pretrained_path=None, n_buckets=500000): from SourceCodeTools.nlp.embed.fasttext import load_w2v_map @@ -181,7 +352,8 @@ def create_node_embedder(self, dataset, tokenizer_path, n_dims=None, pretrained_ pretrained = None if pretrained_path is None and n_dims is None: - raise ValueError(f"Specify embedding dimensionality or provide pretrained 
embeddings") + raise ValueError( + f"Specify embedding dimensionality or provide pretrained embeddings") elif pretrained_path is not None and n_dims is not None: assert n_dims == pretrained.n_dims, f"Requested embedding size and pretrained embedding " \ f"size should match: {n_dims} != {pretrained.n_dims}" @@ -197,14 +369,15 @@ def create_node_embedder(self, dataset, tokenizer_path, n_dims=None, pretrained_ emb_size=n_dims, # tokenizer_path=tokenizer_path, dtype=self.dtype, - pretrained=dataset.buckets_from_pretrained_embeddings(pretrained_path, n_buckets) + pretrained=dataset.buckets_from_pretrained_embeddings( + pretrained_path, n_buckets) if pretrained_path is not None else None, n_buckets=n_buckets ) @property def lr(self): - return self.trainer_params['lr'] + return self.trainer_params['learning_rate'] @property def batch_size(self): @@ -248,6 +421,10 @@ def finetune(self): return False return self.epoch >= self.trainer_params['pretraining_phase'] + @property + def subgraph_id_column(self): + return self.trainer_params["subgraph_id_column"] + @property def do_save(self): return self.trainer_params['save_checkpoints'] @@ -263,22 +440,45 @@ def write_hyperparams(self, scores, epoch): params = copy(self.model_params) params["epoch"] = epoch main_name = os.path.basename(self.model_base_path) - params = {k: v for k, v in params.items() if type(v) in {int, float, str, bool, torch.Tensor}} + params = {k: v for k, v in params.items() if type( + v) in {int, float, str, bool, torch.Tensor}} main_name = os.path.basename(self.model_base_path) scores = {f"h_metric/{k}": v for k, v in scores.items()} - self.summary_writer.add_hparams(params, scores, run_name=f"h_metric/{epoch}") + self.summary_writer.add_hparams( + params, scores, run_name=f"h_metric/{epoch}") def _create_optimizer(self): parameters = nn.ParameterList(self.graph_model.parameters()) - parameters.extend(self.node_embedder.parameters()) - [parameters.extend(objective.parameters()) for objective in 
self.objectives] + nodeembedder_params = list(self.node_embedder.parameters()) + # parameters.extend(self.node_embedder.parameters()) + [parameters.extend(objective.parameters()) + for objective in self.objectives] # AdaHessian TODO could not run # optimizer = Yogi(parameters, lr=self.lr) - optimizer = torch.optim.AdamW( - [{"params": parameters}], lr=self.lr + self.optimizer = torch.optim.AdamW( + [{"params": parameters}], lr=self.lr, weight_decay=0.5 ) - return optimizer + self.sparse_optimizer = torch.optim.SparseAdam( + [{"params": nodeembedder_params}], lr=self.lr + ) + + def compute_embeddings_for_scorer(self, objective): + if hasattr(objective.target_embedder, "scorer_all_keys") and objective.update_embeddings_for_queries: + def chunks(lst, n): + for i in range(0, len(lst), n): + yield torch.LongTensor(lst[i:i + n]) + + batches = chunks( + objective.target_embedder.scorer_all_keys, self.trainer_params["batch_size"]) + for batch in tqdm( + batches, + total=len( + objective.target_embedder.scorer_all_keys) // self.trainer_params["batch_size"] + 1, + desc="Precompute Target Embeddings", leave=True + ): + # scorer embedding updated inside + _ = objective.target_embedding_fn(batch) def train_all(self): """ @@ -287,57 +487,103 @@ def train_all(self): """ summary_dict = {} + best_val_loss = float("inf") + write_best_model = False for objective in self.objectives: with torch.set_grad_enabled(False): - objective.target_embedder.prepare_index() # need this to update sampler for the next epoch + self.compute_embeddings_for_scorer(objective) + # need this to update sampler for the next epoch + objective.target_embedder.prepare_index() for epoch in range(self.epoch, self.epochs): self.epoch = epoch start = time() - keep_training = True - summary_dict = {} - while keep_training: + num_batches = min( + [objective.num_train_batches for objective in self.objectives]) + + train_losses = defaultdict(list) + train_accs = defaultdict(list) + + # def append_metric(destination, 
name, metric): + # if name not in destination: + # destination[name] = [] + # destination[name].append(metric) + + for step in tqdm(range(num_batches), total=num_batches, desc=f"Epoch {self.epoch}"): loss_accum = 0 summary = {} try: - loaders = [objective.loader_next("train") for objective in self.objectives] + loaders = [objective.loader_next( + "train") for objective in self.objectives] except StopIteration: break self.optimizer.zero_grad() - for objective, (input_nodes, seeds, blocks) in zip(self.objectives, loaders): + self.sparse_optimizer.zero_grad() + for ind, (objective, (input_nodes, seeds, blocks)) in enumerate(zip(self.objectives, loaders)): blocks = [blk.to(self.device) for blk in blocks] - loss, acc = objective(input_nodes, seeds, blocks, train_embeddings=self.finetune) + objective.target_embedder.prepare_index() + + do_break = False + for block in blocks: + if block.num_edges() == 0: + do_break = True + if do_break: + break - loss = loss / len(self.objectives) # assumes the same batch size for all objectives + # try: + loss, acc = objective( + input_nodes, seeds, blocks, train_embeddings=self.finetune, + neg_sampling_strategy="w2v" if self.trainer_params["force_w2v_ns"] else None + ) + + # assumes the same batch size for all objectives + loss = loss / len(self.objectives) loss_accum += loss.item() # for groups in self.optimizer.param_groups: # for param in groups["params"]: # torch.nn.utils.clip_grad_norm_(param, max_norm=1.) 
loss.backward() # create_graph = True - summary.update({ - f"Loss/train/{objective.name}_vs_batch": loss.item(), - f"Accuracy/train/{objective.name}_vs_batch": acc, - }) + summary = {} + add_to_summary( + summary=summary, partition="train", objective_name=objective.name, + scores={"Loss": loss.item(), "Accuracy": acc}, postfix="" + ) + + train_losses[f"Loss/train_avg/{objective.name}"].append( + loss.item()) + train_accs[f"Accuracy/train_avg/{objective.name}"].append( + acc) + + # except ZeroEdges as e: + # logging.warning(f"Zero edges in loader in step {step}") + # except Exception as e: + # raise e self.optimizer.step() + self.sparse_optimizer.step() + step += 1 self.write_summary(summary, self.batch) summary_dict.update(summary) self.batch += 1 - summary = { - f"Loss/train": loss_accum, - } + # summary = { + # f"Loss/train": loss_accum, + # } + summary = {key: sum(val) / len(val) + for key, val in train_losses.items()} + summary.update({key: sum(val) / len(val) + for key, val in train_accs.items()}) self.write_summary(summary, self.batch) summary_dict.update(summary) @@ -348,22 +594,37 @@ def train_all(self): objective.eval() with torch.set_grad_enabled(False): - objective.target_embedder.prepare_index() # need this to update sampler for the next epoch - - val_loss, val_acc, val_ndcg = objective.evaluate("val") - test_loss, test_acc, test_ndcg = objective.evaluate("test") + # need this to update sampler for the next epoch + objective.target_embedder.prepare_index() + train_scores = objective.evaluate("train") + val_scores = objective.evaluate("val") + test_scores = objective.evaluate("test") - summary = { - f"Accuracy/test/{objective.name}_vs_batch": test_acc, - f"Accuracy/val/{objective.name}_vs_batch": val_acc, - } - if test_ndcg is not None: - summary.update({f"{key}/test/{objective.name}_vs_batch": val for key, val in test_ndcg.items()}) - if val_ndcg is not None: - summary.update({f"{key}/val/{objective.name}_vs_batch": val for key, val in 
val_ndcg.items()}) + summary = {} + add_to_summary(summary, "train", objective.name, + train_scores, postfix="") + add_to_summary(summary, "val", objective.name, + val_scores, postfix="") + add_to_summary(summary, "test", objective.name, + test_scores, postfix="") self.write_summary(summary, self.batch) summary_dict.update(summary) + # TODO: uncomment + # val_losses = [ + # item for key, item in summary_dict.items() if key.startswith("Loss/val")] + # avg_val_loss = sum(val_losses) / len(val_losses) + # if avg_val_loss < best_val_loss: + # best_val_loss = avg_val_loss + # write_best_model = True + + if self.do_save: + self.save_checkpoint( + self.model_base_path, write_best_model=write_best_model) + write_best_model = False + + if objective.early_stopping_trigger is True: + raise EarlyStopping() objective.train() @@ -371,17 +632,14 @@ def train_all(self): end = time() - print(f"Epoch: {self.epoch}, Time: {int(end - start)} s", end="\t") - print(summary_dict) - - if self.do_save: - self.save_checkpoint(self.model_base_path) + print(f"Epoch: {self.epoch}, Time: {int(end - start)} s", end="\n") + pprint(summary_dict) self.lr_scheduler.step() - def save_checkpoint(self, checkpoint_path=None, checkpoint_name=None, **kwargs): + def save_checkpoint(self, checkpoint_path=None, checkpoint_name=None, write_best_model=False, **kwargs): - checkpoint_path = join(checkpoint_path, "saved_state.pt") + model_path = join(checkpoint_path, f"saved_state.pt") param_dict = { 'graph_model': self.graph_model.state_dict(), @@ -396,47 +654,55 @@ def save_checkpoint(self, checkpoint_path=None, checkpoint_name=None, **kwargs): if len(kwargs) > 0: param_dict.update(kwargs) - torch.save(param_dict, checkpoint_path) + torch.save(param_dict, model_path) + if self.trainer_params["save_each_epoch"]: + torch.save(param_dict, join(checkpoint_path, + f"saved_state_{self.epoch}.pt")) + + if write_best_model: + torch.save(param_dict, join(checkpoint_path, f"best_model.pt")) def 
restore_from_checkpoint(self, checkpoint_path): - checkpoint = torch.load(join(checkpoint_path, "saved_state.pt")) + checkpoint = torch.load( + join(checkpoint_path, "saved_state.pt"), map_location=torch.device('cpu')) self.graph_model.load_state_dict(checkpoint['graph_model']) self.node_embedder.load_state_dict(checkpoint['node_embedder']) for objective in self.objectives: objective.custom_load_state_dict(checkpoint[objective.name]) self.epoch = checkpoint['epoch'] + self.restore_epoch = checkpoint['epoch'] self.batch = checkpoint['batch'] logging.info(f"Restored from epoch {checkpoint['epoch']}") # TODO needs test def final_evaluation(self): - + print('evaluation') summary_dict = {} for objective in self.objectives: objective.reset_iterator("train") objective.reset_iterator("val") objective.reset_iterator("test") + # objective.early_stopping = False + self.compute_embeddings_for_scorer(objective) + objective.target_embedder.prepare_index() + objective.update_embeddings_for_queries = False with torch.set_grad_enabled(False): for objective in self.objectives: + print('train') + train_scores = objective.evaluate("train") + val_scores = objective.evaluate("val") + test_scores = objective.evaluate("test") - train_loss, train_acc, train_ndcg = objective.evaluate("train") - val_loss, val_acc, val_ndcg = objective.evaluate("val") - test_loss, test_acc, test_ndcg = objective.evaluate("test") - - summary = { - f"Accuracy/train/{objective.name}_final": train_acc, - f"Accuracy/test/{objective.name}_final": test_acc, - f"Accuracy/val/{objective.name}_final": val_acc, - } - if train_ndcg is not None: - summary.update({f"{key}/train/{objective.name}_final": val for key, val in train_ndcg.items()}) - if val_ndcg is not None: - summary.update({f"{key}/val/{objective.name}_final": val for key, val in val_ndcg.items()}) - if test_ndcg is not None: - summary.update({f"{key}/test/{objective.name}_final": val for key, val in test_ndcg.items()}) + summary = {} + add_to_summary(summary, 
"train", objective.name, + train_scores, postfix="final") + add_to_summary(summary, "val", objective.name, + val_scores, postfix="final") + add_to_summary(summary, "test", objective.name, + test_scores, postfix="final") summary_dict.update(summary) @@ -466,15 +732,46 @@ def to(self, device): for objective in self.objectives: objective.to(device) + def compute_subgraph_embeddings(self, objective): + def chunks(lst, n): + for i in range(0, len(lst), n): + yield torch.LongTensor(lst[i:i + n]) + + batches = chunks(list(objective.subgraph_mapping.keys())[:-1], + self.trainer_params["batch_size"]) + + ids = [] + embs = [] + + for batch_ids in tqdm( + batches, + total=len( + objective.target_embedder.scorer_all_keys) // self.trainer_params["batch_size"] + 1, + desc="Computer Subgraph Embeddings", leave=True + ): + with torch.no_grad(): + embeddings, _ = objective.target_embedding_fn( + batch_ids, train_embeddings=False) + ids.append(batch_ids) + embs.append(embeddings) + + ids = torch.cat(ids, dim=0) + embeddings = torch.cat(embs, dim=0) + + return ids, embeddings + def get_embeddings(self): # self.graph_model.g.nodes["function"].data.keys() nodes = self.graph_model.g.nodes node_embs = { - ntype: self.node_embedder(node_type=ntype, node_ids=nodes[ntype].data['typed_id'], train_embeddings=False) + ntype: self.node_embedder( + node_type=ntype, node_ids=nodes[ntype].data['typed_id'], train_embeddings=False) for ntype in self.graph_model.g.ntypes } - h = self.graph_model.inference(batch_size=256, device='cpu', num_workers=0, x=node_embs) + logging.info("Computing all embeddings") + h = self.graph_model.inference( + batch_size=2048, device='cpu', num_workers=0, x=node_embs) original_id = [] global_id = [] @@ -491,62 +788,91 @@ def get_embeddings(self): def select_device(args): device = 'cpu' - use_cuda = args.gpu >= 0 and torch.cuda.is_available() + use_cuda = args["gpu"] >= 0 and torch.cuda.is_available() if use_cuda: - torch.cuda.set_device(args.gpu) - device = 'cuda:%d' % 
args.gpu + torch.cuda.set_device(args["gpu"]) + device = 'cuda:%d' % args["gpu"] return device -def training_procedure( - dataset, model_name, model_params, args, model_base_path -) -> Tuple[SamplingMultitaskTrainer, dict]: - - device = select_device(args) +def resolve_activation_function(function_name): + known_functions = { + "tanh": torch.tanh + } - model_params['num_classes'] = args.node_emb_size - model_params['use_gcn_checkpoint'] = args.use_gcn_checkpoint - model_params['use_att_checkpoint'] = args.use_att_checkpoint - model_params['use_gru_checkpoint'] = args.use_gru_checkpoint + return known_functions.get(function_name, eval(f"nn.functional.{function_name}")) - trainer_params = { - 'lr': model_params.pop('lr'), - 'batch_size': args.batch_size, - 'sampling_neighbourhood_size': args.num_per_neigh, - 'neg_sampling_factor': args.neg_sampling_factor, - 'epochs': args.epochs, - # 'node_name_file': args.fname_file, - # 'var_use_file': args.varuse_file, - # 'call_seq_file': args.call_seq_file, - 'elem_emb_size': args.elem_emb_size, - 'model_base_path': model_base_path, - 'pretraining_phase': args.pretraining_phase, - 'use_layer_scheduling': args.use_layer_scheduling, - 'schedule_layers_every': args.schedule_layers_every, - 'embedding_table_size': args.embedding_table_size, - 'save_checkpoints': args.save_checkpoints, - 'measure_ndcg': args.measure_ndcg, - 'dilate_ndcg': args.dilate_ndcg, - "objectives": args.objectives.split(",") - } - trainer = SamplingMultitaskTrainer( +def training_procedure( + dataset, model_name, model_params, trainer_params, model_base_path, + tokenizer_path=None, trainer=None, load_external_dataset=None +) -> Tuple[SamplingMultitaskTrainer, dict]: + model_params = copy(model_params) + trainer_params = copy(trainer_params) + + if trainer is None: + trainer = SamplingMultitaskTrainer + + device = select_device(trainer_params) + + # model_params['num_classes'] = args.node_emb_size + # model_params['use_gcn_checkpoint'] = 
args.use_gcn_checkpoint + # model_params['use_att_checkpoint'] = args.use_att_checkpoint + # model_params['use_gru_checkpoint'] = args.use_gru_checkpoint + + if len(trainer_params["objectives"].split(",")) > 1 and trainer_params["early_stopping"] is True: + print("Early stopping disabled when several objectives are used") + trainer_params["early_stopping"] = False + + trainer_params['model_base_path'] = model_base_path + + model_params["activation"] = resolve_activation_function( + model_params["activation"]) + + # trainer_params = { + # 'lr': model_params.pop('lr'), + # 'batch_size': args.batch_size, + # 'sampling_neighbourhood_size': args.num_per_neigh, + # 'neg_sampling_factor': args.neg_sampling_factor, + # 'epochs': args.epochs, + # 'elem_emb_size': args.elem_emb_size, + # 'model_base_path': model_base_path, + # 'pretraining_phase': args.pretraining_phase, + # 'use_layer_scheduling': args.use_layer_scheduling, + # 'schedule_layers_every': args.schedule_layers_every, + # 'embedding_table_size': args.embedding_table_size, + # 'save_checkpoints': args.save_checkpoints, + # 'measure_scores': args.measure_scores, + # 'dilate_scores': args.dilate_scores, + # "objectives": args.objectives.split(","), + # "early_stopping": args.early_stopping, + # "early_stopping_tolerance": args.early_stopping_tolerance, + # "force_w2v_ns": args.force_w2v_ns, + # "metric": args.metric, + # "nn_index": args.nn_index, + # "save_each_epoch": args.save_each_epoch + # } + + trainer = trainer( dataset=dataset, model_name=model_name, model_params=model_params, trainer_params=trainer_params, - restore=args.restore_state, + restore=trainer_params["restore_state"], device=device, - pretrained_embeddings_path=args.pretrained, - tokenizer_path=args.tokenizer + pretrained_embeddings_path=trainer_params["pretrained"], + tokenizer_path=tokenizer_path, + load_external_dataset=load_external_dataset ) - try: - trainer.train_all() - except KeyboardInterrupt: - print("Training interrupted") - except 
Exception as e: - raise e + # try: + trainer.train_all() + # except KeyboardInterrupt: + # logging.info("Training interrupted") + # except EarlyStopping: + # logging.info("Early stopping triggered") + # except Exception as e: + # print("There was an exception", e) trainer.eval() scores = trainer.final_evaluation() @@ -557,8 +883,12 @@ def training_procedure( def evaluation_procedure( - dataset, model_name, model_params, args, model_base_path -) -> Tuple[SamplingMultitaskTrainer, dict]: + dataset, model_name, model_params, args, model_base_path, trainer=None +): + model_params = copy(model_params) + + if trainer is None: + trainer = SamplingMultitaskTrainer device = select_device(args) @@ -567,7 +897,8 @@ def evaluation_procedure( model_params['use_att_checkpoint'] = args.use_att_checkpoint model_params['use_gru_checkpoint'] = args.use_gru_checkpoint - model_params["activation"] = eval(f"nn.functional.{model_params['activation']}") + model_params["activation"] = eval( + f"nn.functional.{model_params['activation']}") trainer_params = { 'lr': model_params.pop("lr"), @@ -575,9 +906,6 @@ def evaluation_procedure( 'sampling_neighbourhood_size': args.num_per_neigh, 'neg_sampling_factor': args.neg_sampling_factor, 'epochs': args.epochs, - # 'node_name_file': args.fname_file, - # 'var_use_file': args.varuse_file, - # 'call_seq_file': args.call_seq_file, 'elem_emb_size': args.elem_emb_size, 'model_base_path': model_base_path, 'pretraining_phase': args.pretraining_phase, @@ -585,11 +913,11 @@ def evaluation_procedure( 'schedule_layers_every': args.schedule_layers_every, 'embedding_table_size': args.embedding_table_size, 'save_checkpoints': args.save_checkpoints, - 'measure_ndcg': args.measure_ndcg, - 'dilate_ndcg': args.dilate_ndcg + 'measure_scores': args.measure_scores, + 'dilate_scores': args.dilate_scores } - trainer = SamplingMultitaskTrainer( + trainer = trainer( # SamplingMultitaskTrainer( dataset=dataset, model_name=model_name, model_params=model_params, diff --git 
a/SourceCodeTools/models/graph/train/test_rggan.py b/SourceCodeTools/models/graph/train/test_rggan.py new file mode 100644 index 00000000..ca804ddf --- /dev/null +++ b/SourceCodeTools/models/graph/train/test_rggan.py @@ -0,0 +1,537 @@ +import itertools +import logging +from copy import copy +from datetime import datetime +from os import mkdir +from os.path import isdir + +import pandas as pd + +from dgl.data import FB15kDataset, FB15k237Dataset, AMDataset +import torch +from sklearn.model_selection import ParameterGrid + +from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset +from SourceCodeTools.models.graph import RGGAN +from SourceCodeTools.models.graph.NodeEmbedder import NodeIdEmbedder +from SourceCodeTools.models.graph.train.sampling_multitask2 import SamplingMultitaskTrainer +from SourceCodeTools.models.graph.train.utils import get_name, get_model_base +from SourceCodeTools.models.training_options import add_gnn_train_args, verify_arguments + +rggan_grids = [ + { + 'h_dim': [15], + 'num_bases': [-1], + 'num_steps': [5], + 'dropout': [0.0], + 'use_self_loop': [False], + 'activation': [torch.tanh], # torch.nn.functional.hardswish], #[torch.nn.functional.hardtanh], #torch.nn.functional.leaky_relu + 'lr': [1e-3], # 1e-4] + } +] + +rggan_params = list( + itertools.chain.from_iterable( + [ParameterGrid(p) for p in rggan_grids] + ) +) + + +class TestNodeClfGraph(SourceGraphDataset): + def __init__(self, data_loader, + label_from=None, use_node_types=True, + use_edge_types=True, filter=None, self_loops=False, + train_frac=0.6, random_seed=None, tokenizer_path=None, min_count_for_objectives=1, + no_global_edges=False, remove_reverse=False, package_names=None): + self.random_seed = random_seed + self.nodes_have_types = use_node_types + self.edges_have_types = use_edge_types + self.labels_from = label_from + # self.data_path = data_path + self.tokenizer_path = tokenizer_path + self.min_count_for_objectives = min_count_for_objectives + 
self.no_global_edges = no_global_edges + self.remove_reverse = remove_reverse + + # dataset = AIFBDataset() + # dataset = MUTAGDataset() + # dataset = BGSDataset() + # dataset = AMDataset() + dataset = data_loader() + + self.nodes, self.edges, self.typed_id_map = self.create_nodes_edges_df(dataset) + + # index is later used for sampling and is assumed to be unique + assert len(self.nodes) == len(self.nodes.index.unique()) + assert len(self.edges) == len(self.edges.index.unique()) + + if self_loops: + self.nodes, self.edges = SourceGraphDataset._assess_need_for_self_loops(self.nodes, self.edges) + + if filter is not None: + for e_type in filter.split(","): + logging.info(f"Filtering edge type {e_type}") + self.edges = self.edges.query(f"type != '{e_type}'") + + # if self.remove_reverse: + # self.remove_reverse_edges() + # + # if self.no_global_edges: + # self.remove_global_edges() + + if use_node_types is False and use_edge_types is False: + new_nodes, new_edges = self._create_nodetype_edges() + self.nodes = self.nodes.append(new_nodes, ignore_index=True) + self.edges = self.edges.append(new_edges, ignore_index=True) + + self.nodes['type_backup'] = self.nodes['type'] + if not self.nodes_have_types: + self.nodes['type'] = "node_" + self.nodes = self.nodes.astype({'type': 'category'}) + + self.add_embeddable_flag() + + # need to do this to avoid issues insode dgl library + self.edges['type'] = self.edges['type'].apply(lambda x: f"{x}_") + self.edges['type_backup'] = self.edges['type'] + if not self.edges_have_types: + self.edges['type'] = "edge_" + self.edges = self.edges.astype({'type': 'category'}) + + # compact labels + # self.nodes['label'] = self.nodes[label_from] + # self.nodes = self.nodes.astype({'label': 'category'}) + # self.label_map = compact_property(self.nodes['label']) + # assert any(pandas.isna(self.nodes['label'])) is False + + logging.info(f"Unique nodes: {len(self.nodes)}, node types: {len(self.nodes['type'].unique())}") + logging.info(f"Unique 
edges: {len(self.edges)}, edge types: {len(self.edges['type'].unique())}") + + # self.nodes, self.label_map = self.add_compact_labels() + self._add_typed_ids() + + # self.add_splits(train_frac=train_frac, package_names=package_names) + + # self.mark_leaf_nodes() + + self._create_hetero_graph() + + self._update_global_id() + + self.nodes.sort_values('global_graph_id', inplace=True) + + # self.splits = SourceGraphDataset.get_global_graph_id_splits(self.nodes) + + def create_nodes_edges_df(self, dataset): + graph = dataset[0] + nodes_df = None + + node_id_map = {} + typed_id_map = {} + + for ntype in graph.ntypes: + typed_id = graph.nodes(ntype=ntype).tolist() + type = [ntype] * len(typed_id) + name = list(map(lambda x: ntype+f"_{x}", typed_id)) + id_ = graph.nodes[ntype].data["_ID"].tolist() + node_dict = {"id": id_, "type": type, "name": name, "typed_id": typed_id} + + node_id_map[ntype] = dict(zip(typed_id, id_)) + typed_id_map[ntype] = dict(zip(id_, typed_id)) + + if "labels" in graph.nodes[ntype].data: + node_dict["labels"] = graph.nodes[ntype].data["labels"].tolist() + else: + node_dict["labels"] = [-1] * len(typed_id) + + if "train_mask" in graph.nodes[ntype].data: + node_dict["train_mask"] = graph.nodes[ntype].data["train_mask"].bool().tolist() + else: + node_dict["train_mask"] = [False] * len(typed_id) + + if "test_mask" in graph.nodes[ntype].data: + node_dict["test_mask"] = graph.nodes[ntype].data["test_mask"].bool().tolist() + else: + node_dict["test_mask"] = [False] * len(typed_id) + + if "val_mask" in graph.nodes[ntype].data: + node_dict["val_mask"] = graph.nodes[ntype].data["val_mask"].bool().tolist() + else: + node_dict["val_mask"] = [False] * len(typed_id) + + if nodes_df is None: + nodes_df = pd.DataFrame.from_dict(node_dict) + else: + nodes_df = nodes_df.append(pd.DataFrame.from_dict(node_dict)) + + assert len(nodes_df["id"]) == len(nodes_df["id"].unique()) # thus is a must have assert + + nodes_df = nodes_df.reset_index(drop=True) + # 
contiguous_node_index = dict(zip(nodes_df["index"], nodes_df.index)) + + edges_df = None + for srctype, etype, dsttype in graph.canonical_etypes: + src, dst = graph.edges(etype=(srctype, etype, dsttype)) + src = list(map(lambda x: node_id_map[srctype][x], src.tolist())) + dst = list(map(lambda x: node_id_map[dsttype][x], dst.tolist())) + + edge_data = { + "id": graph.edata["_ID"][(srctype, etype, dsttype)].tolist(), + "type": [etype] * len(src), + "src": src, + "dst": dst + } + + if edges_df is None: + edges_df = pd.DataFrame.from_dict(edge_data) + else: + edges_df = edges_df.append(pd.DataFrame.from_dict(edge_data)) + + # assert len(edges_df["id"]) == len(edges_df["id"].unique()) # fails with AMDataset + + edges_df = edges_df.reset_index(drop=True) + + return nodes_df, edges_df, typed_id_map + + def add_embeddable_flag(self): + self.nodes['embeddable'] = True + + def _add_typed_ids(self): + pass + + def load_node_classes(self): + labels = self.nodes.query("train_mask == True or test_mask == True or val_mask == True")[["id", "labels"]].rename({ + "id": "src", + "labels": "dst" + }, axis=1) + return labels + + +class TestLinkPredGraph(TestNodeClfGraph): + def __init__( + self, data_loader, label_from=None, use_node_types=True, + use_edge_types=True, filter=None, self_loops=False, + train_frac=0.6, random_seed=None, tokenizer_path=None, min_count_for_objectives=1, + no_global_edges=False, remove_reverse=False, package_names=None + ): + self.random_seed = random_seed + self.nodes_have_types = use_node_types + self.edges_have_types = use_edge_types + self.labels_from = label_from + # self.data_path = data_path + self.tokenizer_path = tokenizer_path + self.min_count_for_objectives = min_count_for_objectives + self.no_global_edges = no_global_edges + self.remove_reverse = remove_reverse + + dataset = data_loader() + + self.nodes, self.edges, self.typed_id_map = self.create_nodes_edges_df(dataset) + + # index is later used for sampling and is assumed to be unique + assert 
len(self.nodes) == len(self.nodes.index.unique()) + assert len(self.edges) == len(self.edges.index.unique()) + + if self_loops: + self.nodes, self.edges = SourceGraphDataset._assess_need_for_self_loops(self.nodes, self.edges) + self.nodes, self.val_edges = SourceGraphDataset._assess_need_for_self_loops(self.nodes, self.val_edges) + self.nodes, self.test_edges = SourceGraphDataset._assess_need_for_self_loops(self.nodes, self.test_edges) + + if filter is not None: + for e_type in filter.split(","): + logging.info(f"Filtering edge type {e_type}") + self.edges = self.edges.query(f"type != '{e_type}'") + self.val_edges = self.val_edges.query(f"type != '{e_type}'") + self.test_edges = self.test_edges.query(f"type != '{e_type}'") + + if use_node_types is False and use_edge_types is False: + new_nodes, new_edges = self._create_nodetype_edges() + self.nodes = self.nodes.append(new_nodes, ignore_index=True) + self.edges = self.edges.append(new_edges, ignore_index=True) + + self.nodes['type_backup'] = self.nodes['type'] + if not self.nodes_have_types: + self.nodes['type'] = "node_" + self.nodes = self.nodes.astype({'type': 'category'}) + + self.add_embeddable_flag() + + # need to do this to avoid issues insode dgl library + self.edges['type'] = self.edges['type'].apply(lambda x: f"{x}_") + self.edges['type_backup'] = self.edges['type'] + if not self.edges_have_types: + self.edges['type'] = "edge_" + self.edges = self.edges.astype({'type': 'category'}) + + # compact labels + # self.nodes['label'] = self.nodes[label_from] + # self.nodes = self.nodes.astype({'label': 'category'}) + # self.label_map = compact_property(self.nodes['label']) + # assert any(pandas.isna(self.nodes['label'])) is False + + logging.info(f"Unique nodes: {len(self.nodes)}, node types: {len(self.nodes['type'].unique())}") + logging.info(f"Unique edges: {len(self.edges)}, edge types: {len(self.edges['type'].unique())}") + + # self.nodes, self.label_map = self.add_compact_labels() + self._add_typed_ids() + + 
# self.add_splits(train_frac=train_frac, package_names=package_names) + + # self.mark_leaf_nodes() + + self._create_hetero_graph() + + self._update_global_id() + + self.nodes.sort_values('global_graph_id', inplace=True) + + # self.splits = SourceGraphDataset.get_global_graph_id_splits(self.nodes) + + def create_nodes_edges_df(self, dataset): + graph = dataset[0] + nodes_df = None + + node_id_map = {} + typed_id_map = {} + typed_id = [] + + id_ = graph.nodes().tolist() + type = graph.ndata["ntype"].tolist() + src, dst = graph.edges() + src = src.tolist() + dst = dst.tolist() + edge_type = graph.edata["etype"].tolist() + train_mask = graph.edata["train_mask"].tolist() + val_mask = graph.edata["val_mask"].tolist() + test_mask = graph.edata["test_mask"].tolist() + # node2train_mask = dict(zip(src, train_mask)) + # node2val_mask = dict(zip(src, val_mask)) + # node2test_mask = dict(zip(src, test_mask)) + + for nid_, ntype_ in zip(id_, type): + if ntype_ not in node_id_map: + node_id_map[ntype_] = dict() + typed_id_map[ntype_] = dict() + + node_id_map[ntype_][len(node_id_map[ntype_])] = nid_ + typed_id_map[ntype_][nid_] = len(typed_id_map[ntype_]) + typed_id.append(typed_id_map[ntype_][nid_]) + + name = list(map(lambda x: f"{x[0]}_{x[1]}", zip(type, typed_id))) + + node_dict = { + "id": id_, "type": type, "name": name, "typed_id": typed_id, + # "train_mask": list(map(lambda x: node2train_mask[x], id_)), + # "val_mask": list(map(lambda x: node2val_mask[x], id_)), + # "test_mask": list(map(lambda x: node2test_mask[x], id_)) + } + + nodes_df = pd.DataFrame.from_dict(node_dict) + + assert len(nodes_df["id"]) == len(nodes_df["id"].unique()) + + nodes_df = nodes_df.reset_index(drop=True) + # contiguous_node_index = dict(zip(nodes_df["index"], nodes_df.index)) + + assert len(node_id_map) == 1 + + edges_df = [] + for e_ind, (src, etype, dst) in enumerate(zip(src, edge_type, dst)): + edge_data = { + "id": e_ind, + "type": etype, + "src": src, + "dst": dst, + "train_mask": 
train_mask[e_ind], + "val_mask": val_mask[e_ind], + "test_mask": test_mask[e_ind] + } + edges_df.append(edge_data) + + edges_df = pd.DataFrame(edges_df) + + assert len(edges_df["id"]) == len(edges_df["id"].unique()) + + edges_df = edges_df.reset_index(drop=True) + self.train_edges = edges_df.query("train_mask == True") + self.val_edges = edges_df.query("train_mask == True or val_mask == True") + self.test_edges = edges_df + + return nodes_df, self.train_edges, typed_id_map + + +class TestTrainer(SamplingMultitaskTrainer): + def __init__(self, *args, **kwargs): + super(TestTrainer, self).__init__(*args, **kwargs) + + def create_node_embedder(self, dataset, tokenizer_path, n_dims=None, pretrained_path=None, n_buckets=500000): + self.node_embedder = NodeIdEmbedder( + nodes=dataset.nodes, + emb_size=n_dims, + dtype=self.dtype, + n_buckets=len(dataset.nodes) + 1 # override this because bucket size should be the same as number of nodes for this embedder + ) + + +def main_node_clf(models, args, data_loader): + for model, param_grid in models.items(): + for params in param_grid: + + if args.h_dim is None: + params["h_dim"] = args.node_emb_size + else: + params["h_dim"] = args.h_dim + + params["num_steps"] = args.n_layers + + date_time = str(datetime.now()) + print("\n\n") + print(date_time) + print(f"Model: {model.__name__}, Params: {params}") + + model_attempt = get_name(model, date_time) + + model_base = get_model_base(args, model_attempt) + + dataset = TestNodeClfGraph(data_loader=data_loader) + + args.objectives = "node_clf" + + from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure + + trainer, scores = training_procedure(dataset, model, copy(params), args, model_base, trainer=TestTrainer) + + return scores + + +def main_link_pred(models, args, data_loader): + for model, param_grid in models.items(): + for params in param_grid: + + if args.h_dim is None: + params["h_dim"] = args.node_emb_size + else: + params["h_dim"] = args.h_dim + + 
params["num_steps"] = args.n_layers + + date_time = str(datetime.now()) + print("\n\n") + print(date_time) + print(f"Model: {model.__name__}, Params: {params}") + + model_attempt = get_name(model, date_time) + + model_base = get_model_base(args, model_attempt) + + dataset = TestLinkPredGraph(data_loader=data_loader) + + args.objectives = "link_pred" + + from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure + + trainer, scores = training_procedure(dataset, model, copy(params), args, model_base, trainer=TestTrainer) + + return scores + +def format_data(dataset): + graph = dataset[0] + category = dataset.predict_category + train_mask = graph.nodes[category].data.pop('train_mask') + test_mask = graph.nodes[category].data.pop('test_mask') + labels = graph.nodes[category].data.pop('labels') + train_labels = labels[train_mask] + test_labels = labels[test_mask] + +def node_clf(args): + logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(module)s:%(lineno)d:%(message)s") + + models_ = { + # GCNSampling: gcnsampling_params, + # GATSampler: gatsampling_params, + # RGCNSampling: rgcnsampling_params, + # RGAN: rgcnsampling_params, + RGGAN: rggan_params + + } + + if not isdir(args.model_output_dir): + mkdir(args.model_output_dir) + args.save_checkpoints = False + + # data_loaders = [AIFBDataset, MUTAGDataset, BGSDataset, AMDataset] + data_loaders = [AMDataset] + + for dl in data_loaders: + print(dl.__name__) + scores = main_node_clf(models_, args, data_loader=dl) + print("\t", scores) + + +def link_pred(args): + logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(module)s:%(lineno)d:%(message)s") + + models_ = { + # GCNSampling: gcnsampling_params, + # GATSampler: gatsampling_params, + # RGCNSampling: rgcnsampling_params, + # RGAN: rgcnsampling_params, + RGGAN: rggan_params + + } + + if not isdir(args.model_output_dir): + mkdir(args.model_output_dir) + args.save_checkpoints = False + + data_loaders = 
[FB15kDataset, FB15k237Dataset] + + for dl in data_loaders: + print(dl.__name__) + scores = main_link_pred(models_, args, data_loader=dl) + print("\t", scores) + + + # dataset = AIFBDataset() + # graph = dataset[0] + # category = dataset.predict_category + # num_classes = dataset.num_classes + # train_mask = graph.nodes[category].data.pop('train_mask') + # test_mask = graph.nodes[category].data.pop('test_mask') + # labels = graph.nodes[category].data.pop('labels') + # + # print() + +# def main(): +# dataset = FB15k237Dataset() +# g = dataset.graph +# e_type = g.edata['e_type'] +# +# train_mask = g.edata['train_mask'] +# val_mask = g.edata['val_mask'] +# test_mask = g.edata['test_mask'] +# +# # graph = dataset[0] +# # train_mask = graph.edata['train_mask'] +# # test_mask = g.edata['test_mask'] +# +# train_set = torch.arange(g.number_of_edges())[train_mask] +# val_set = torch.arange(g.number_of_edges())[val_mask] +# +# train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze() +# src, dst = graph.edges(train_idx) +# rel = graph.edata['etype'][train_idx] +# +# print() + +if __name__=="__main__": + import argparse + + parser = argparse.ArgumentParser(description='Process some integers.') + add_gnn_train_args(parser) + + args = parser.parse_args() + verify_arguments(args) + + node_clf(args) + # link_pred(args) \ No newline at end of file diff --git a/SourceCodeTools/models/graph/train/utils.py b/SourceCodeTools/models/graph/train/utils.py index 95d1a14c..cf07e3ee 100644 --- a/SourceCodeTools/models/graph/train/utils.py +++ b/SourceCodeTools/models/graph/train/utils.py @@ -18,11 +18,11 @@ def get_name(model, timestamp): return "{} {}".format(model.__name__, timestamp).replace(":", "-").replace(" ", "-").replace(".", "-") -def get_model_base(args, model_attempt): - if args.restore_state: - model_base = args.model_output_dir +def get_model_base(args, model_attempt, force_new=False): + if args["restore_state"] and not force_new: + model_base = args["model_output_dir"] 
else: - model_base = join(args.model_output_dir, model_attempt) + model_base = join(args["model_output_dir"], model_attempt) if not isdir(model_base): mkdir(model_base) diff --git a/SourceCodeTools/models/graph/utils/converters/convert_dglke_for_experiments.py b/SourceCodeTools/models/graph/utils/converters/convert_dglke_for_experiments.py index 0e157b34..d4844ec4 100644 --- a/SourceCodeTools/models/graph/utils/converters/convert_dglke_for_experiments.py +++ b/SourceCodeTools/models/graph/utils/converters/convert_dglke_for_experiments.py @@ -3,7 +3,7 @@ import os import pandas as pd import numpy as np -from SourceCodeTools.code.data.sourcetrail.Dataset import get_train_val_test_indices, load_data +from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, load_data # get_train_val_test_indices, load_data import pickle import argparse @@ -66,13 +66,13 @@ def load_npy(ent_map_path, npy_path): # splits = get_train_val_test_indices(nodes.index) from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types -splits = get_train_val_test_indices(nodes.query(f"type_backup == '{node_types[4096]}'").index) +splits = SourceGraphDataset.get_train_val_test_indices(nodes.query(f"type_backup == '{node_types[4096]}'").index) # nodes, edges, held = SourceGraphDataset.holdout(nodes, edges, 0.001) # nodes['label'] = nodes['type'] -from SourceCodeTools.code.data.sourcetrail.Dataset import create_train_val_test_masks +# from SourceCodeTools.code.data.dataset.Dataset import create_train_val_test_masks # def add_splits(nodes, splits): # nodes['train_mask'] = False # nodes.loc[nodes.index[splits[0]], 'train_mask'] = True @@ -94,7 +94,7 @@ def load_npy(ent_map_path, npy_path): os.path.join(args.out_path, "state_dict.pt") ) -create_train_val_test_masks(nodes, *splits) +SourceGraphDataset.create_train_val_test_masks(nodes, *splits) nodes.to_csv(os.path.join(args.out_path, "nodes.csv"), index=False) edges.to_csv(os.path.join(args.out_path, "edges.csv"), 
index=False) diff --git a/SourceCodeTools/models/graph/utils/converters/convert_w2v_for_experiments.py b/SourceCodeTools/models/graph/utils/converters/convert_w2v_for_experiments.py index 3e60e587..f59687bd 100644 --- a/SourceCodeTools/models/graph/utils/converters/convert_w2v_for_experiments.py +++ b/SourceCodeTools/models/graph/utils/converters/convert_w2v_for_experiments.py @@ -2,7 +2,7 @@ import torch import os, sys import numpy as np -from SourceCodeTools.code.data.sourcetrail.Dataset import get_train_val_test_indices, load_data +from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, load_data # get_train_val_test_indices, load_data import pickle @@ -43,7 +43,7 @@ def load_w2v(path): # splits = get_train_val_test_indices(nodes.index) from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types -splits = get_train_val_test_indices(nodes.query(f"type_backup == '{node_types[4096]}'").index) +splits = SourceGraphDataset.get_train_val_test_indices(nodes.query(f"type_backup == '{node_types[4096]}'").index) id_map, vecs = load_w2v(emb_path) @@ -65,8 +65,8 @@ def load_w2v(path): os.path.join(out_path, "state_dict.pt") ) -from SourceCodeTools.code.data.sourcetrail.Dataset import create_train_val_test_masks -create_train_val_test_masks(nodes, *splits) +# from SourceCodeTools.code.data.dataset.Dataset import create_train_val_test_masks +SourceGraphDataset.create_train_val_test_masks(nodes, *splits) nodes.to_csv(os.path.join(out_path, "nodes.csv"), index=False) edges.to_csv(os.path.join(out_path, "edges.csv"), index=False) diff --git a/SourceCodeTools/models/graph/utils/dglke_to_embedder.py b/SourceCodeTools/models/graph/utils/dglke_to_embedder.py new file mode 100644 index 00000000..093b1a00 --- /dev/null +++ b/SourceCodeTools/models/graph/utils/dglke_to_embedder.py @@ -0,0 +1,38 @@ +import pickle + +import numpy +import pandas as pd + +from SourceCodeTools.models.Embedder import Embedder + + +def read_entities(path): + return 
pd.read_csv(path, sep="\t", header=None)[1] + + +def read_vectors(path): + return numpy.load(path) + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("entities") + parser.add_argument("vectors") + parser.add_argument("output") + + args = parser.parse_args() + + entities = read_entities(args.entities) + vectors = read_vectors(args.vectors) + + embedder = Embedder(dict(zip(entities, range(len(entities)))), vectors) + with open(args.output, "wb") as sink: + pickle.dump(embedder, sink) + + + + + +if __name__=="__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/models/graph/utils/export4visualization.py b/SourceCodeTools/models/graph/utils/export4visualization.py index 9533a5d1..69d28454 100644 --- a/SourceCodeTools/models/graph/utils/export4visualization.py +++ b/SourceCodeTools/models/graph/utils/export4visualization.py @@ -1,89 +1,130 @@ import pickle -import sys -import pandas +from os.path import join + import numpy as np import os from collections import Counter - -model_path = sys.argv[1] - -if len(sys.argv) > 2: - max_embs = int(sys.argv[2]) -else: - max_embs = 5000 - -nodes_path = os.path.join(model_path, "nodes.csv") -edges_path = os.path.join(model_path, "edges.csv") -embedders_path = os.path.join(model_path, "embeddings.pkl") - -embedders = pickle.load(open(embedders_path, "rb")) -nodes = pandas.read_csv(nodes_path) - -edges = pandas.read_csv(edges_path) -degrees = Counter(edges['src'].tolist()) + Counter(edges['dst'].tolist()) - -ids = nodes['id'].values -names = nodes['name']#.apply(lambda x: x.split(".")[-1]).values - -id_name_map = list(zip(ids, names)) -id_name_map_d = dict(id_name_map) - -ind_mapper = embedders[0].ind - -id_name_map = sorted(id_name_map, key=lambda x: ind_mapper[x[0]]) - -ids, names = zip(*id_name_map) - -print(f"Limiting to {max_embs} embeddings") - - -# emb0 = [] -# emb1 = [] -# emb2 = [] - -for group, gr_ in nodes.groupby("type_backup"): - c_ = 0 - - names = [] - 
embs = [[] for _ in embedders] - - nodes_in_group = set(gr_['id'].tolist()) - for ind, (id_, count) in enumerate(degrees.most_common()): - if id_ not in nodes_in_group: continue - names.append(id_name_map_d[id_]) - for emb_, embedder in zip(embs, embedders): - emb_.append(embedder.e[embedder.ind[id_]]) - # emb0.append(embedders[0].e[embedders[0].ind[id_]]) - # emb1.append(embedders[1].e[embedders[1].ind[id_]]) - # emb2.append(embedders[2].e[embedders[2].ind[id_]]) - c_ += 1 - if c_ >= max_embs - 1: break - # if ind >= max_embs-1: break - - # np.savetxt(os.path.join(model_path, "emb4proj_meta.tsv"), np.array(names).reshape(-1, 1)) - for ind, emb_ in enumerate(embs): - np.savetxt(os.path.join(model_path, f"emb4proj{ind}_{group}.tsv"), np.array(emb_), delimiter="\t") - # np.savetxt(os.path.join(model_path, "emb4proj0.tsv"), np.array(emb0), delimiter="\t") - # np.savetxt(os.path.join(model_path, "emb4proj1.tsv"), np.array(emb1), delimiter="\t") - # np.savetxt(os.path.join(model_path, "emb4proj2.tsv"), np.array(emb2), delimiter="\t") - - print("Writing meta...", end="") - with open(os.path.join(model_path, f"emb4proj_meta_{group}.tsv"), "w") as meta: - for name in names[:max_embs]: - meta.write(f"{name}\n") - print("done") - -# with open(os.path.join(model_path, "emb4proj2w2v.txt"), "w") as w2v: -# for ind, name in enumerate(names): -# w2v.write("%s " % name) -# for j, v in enumerate(emb2[ind]): -# if j < len(emb2[ind]) - 1: -# w2v.write("%f " % v) -# else: -# w2v.write("%f\n" % v) - - -# for ind, e in enumerate(embedders): -# print(f"Writing embedding layer {ind}...", end="") -# np.savetxt(os.path.join(model_path, f"emb4proj{ind}.tsv"), e.e[:max_embs], delimiter="\t") -# print("done") +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("model_path") + parser.add_argument("output_path") + parser.add_argument("--max_embs", type=int, default=5000) + parser.add_argument("--into_groups", action="store_true") + args = 
parser.parse_args() + model_path = args.model_path + + dataset = pickle.load(open(join(args.model_path, "dataset.pkl"), "rb")) + embedders = pickle.load(open(join(args.model_path, "embeddings.pkl"), "rb")) + + nodes = dataset.nodes + edges = dataset.edges + + degrees = Counter(edges['src'].tolist()) + Counter(edges['dst'].tolist()) + + ids = nodes['id'].values + names = nodes['name']#.apply(lambda x: x.split(".")[-1]).values + + id_name_map = list(zip(ids, names)) + id_name_map_d = dict(id_name_map) + + ind_mapper = embedders[0].ind + + id_name_map = sorted(id_name_map, key=lambda x: ind_mapper[x[0]]) + + ids, names = zip(*id_name_map) + + print(f"Limiting to {args.max_embs} embeddings") + + # emb0 = [] + # emb1 = [] + # emb2 = [] + + def write_in_groups(): + + for group, gr_ in nodes.groupby("type_backup"): + c_ = 0 + + names = [] + embs = [[] for _ in embedders] + + nodes_in_group = set(gr_['id'].tolist()) + for ind, (id_, count) in enumerate(degrees.most_common()): + if id_ not in nodes_in_group: continue + names.append(id_name_map_d[id_]) + for emb_, embedder in zip(embs, embedders): + emb_.append(embedder.e[embedder.ind[id_]]) + # emb0.append(embedders[0].e[embedders[0].ind[id_]]) + # emb1.append(embedders[1].e[embedders[1].ind[id_]]) + # emb2.append(embedders[2].e[embedders[2].ind[id_]]) + c_ += 1 + if c_ >= args.max_embs - 1: break + # if ind >= max_embs-1: break + + # np.savetxt(os.path.join(model_path, "emb4proj_meta.tsv"), np.array(names).reshape(-1, 1)) + for ind, emb_ in enumerate(embs): + np.savetxt(os.path.join(args.output_path, f"emb4proj{ind}_{group}.tsv"), np.array(emb_), delimiter="\t") + # np.savetxt(os.path.join(model_path, "emb4proj0.tsv"), np.array(emb0), delimiter="\t") + # np.savetxt(os.path.join(model_path, "emb4proj1.tsv"), np.array(emb1), delimiter="\t") + # np.savetxt(os.path.join(model_path, "emb4proj2.tsv"), np.array(emb2), delimiter="\t") + + print("Writing meta...", end="") + with open(os.path.join(args.output_path, 
f"emb4proj_meta_{group}.tsv"), "w") as meta: + for name in names[:args.max_embs]: + meta.write(f"{name}\n") + print("done") + + def write_all_together(): + c_ = 0 + + names = [] + embs = [[] for _ in embedders] + + nodes_in_group = set(nodes['id'].tolist()) + for ind, (id_, count) in enumerate(degrees.most_common()): + if id_ not in nodes_in_group: continue + names.append(id_name_map_d[id_]) + for emb_, embedder in zip(embs, embedders): + emb_.append(embedder.e[embedder.ind[id_]]) + c_ += 1 + if c_ >= args.max_embs - 1: break + # if ind >= max_embs-1: break + + # np.savetxt(os.path.join(model_path, "emb4proj_meta.tsv"), np.array(names).reshape(-1, 1)) + for ind, emb_ in enumerate(embs): + np.savetxt(os.path.join(args.output_path, f"emb4proj{ind}.tsv"), np.array(emb_), delimiter="\t") + # np.savetxt(os.path.join(model_path, "emb4proj0.tsv"), np.array(emb0), delimiter="\t") + # np.savetxt(os.path.join(model_path, "emb4proj1.tsv"), np.array(emb1), delimiter="\t") + # np.savetxt(os.path.join(model_path, "emb4proj2.tsv"), np.array(emb2), delimiter="\t") + + print("Writing meta...", end="") + with open(os.path.join(args.output_path, f"emb4proj_meta.tsv"), "w") as meta: + for name in names[:args.max_embs]: + meta.write(f"{name}\n") + print("done") + + if args.into_groups: + write_in_groups() + else: + write_all_together() + + + # with open(os.path.join(model_path, "emb4proj2w2v.txt"), "w") as w2v: + # for ind, name in enumerate(names): + # w2v.write("%s " % name) + # for j, v in enumerate(emb2[ind]): + # if j < len(emb2[ind]) - 1: + # w2v.write("%f " % v) + # else: + # w2v.write("%f\n" % v) + + + # for ind, e in enumerate(embedders): + # print(f"Writing embedding layer {ind}...", end="") + # np.savetxt(os.path.join(model_path, f"emb4proj{ind}.tsv"), e.e[:max_embs], delimiter="\t") + # print("done") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/models/graph/utils/import_into_neo4j.py 
b/SourceCodeTools/models/graph/utils/import_into_neo4j.py index bfc7571e..2b122be8 100644 --- a/SourceCodeTools/models/graph/utils/import_into_neo4j.py +++ b/SourceCodeTools/models/graph/utils/import_into_neo4j.py @@ -2,7 +2,7 @@ from neo4j import GraphDatabase import argparse -from SourceCodeTools.code.data.sourcetrail.Dataset import load_data +from SourceCodeTools.code.data.dataset.Dataset import load_data # from SourceCodeTools.data.sourcetrail.sourcetrail_types import node_types, edge_types diff --git a/SourceCodeTools/models/graph/utils/prepare_dglke_format.py b/SourceCodeTools/models/graph/utils/prepare_dglke_format.py index 3482f452..e039b249 100644 --- a/SourceCodeTools/models/graph/utils/prepare_dglke_format.py +++ b/SourceCodeTools/models/graph/utils/prepare_dglke_format.py @@ -1,79 +1,85 @@ +import logging import os -import pandas as pd -from SourceCodeTools.code.data.sourcetrail.Dataset import SourceGraphDataset, load_data, compact_property +from os.path import isdir, join, isfile + +from SourceCodeTools.code.data.dataset.Dataset import load_data, compact_property, SourceGraphDataset import argparse -parser = argparse.ArgumentParser(description='Process some integers.') -parser.add_argument('--nodes_path', dest='nodes_path', default=None, - help='Path to the file with nodes') -parser.add_argument('--edges_path', dest='edges_path', default=None, - help='Path to the file with edges') -parser.add_argument('--fname_path', dest='fname_path', default=None, - help='') -parser.add_argument('--varuse_path', dest='varuse_path', default=None, - help='') -parser.add_argument('--apicall_path', dest='apicall_path', default=None, - help='') -parser.add_argument('--out_path', dest='out_path', default=None, - help='') +from SourceCodeTools.code.data.file_utils import unpersist, persist + + +def get_paths(dataset_path, use_extra_objectives): + extra_objectives = ["node_names.bz2", "common_function_variable_pairs.bz2", "common_call_seq.bz2", "type_annotations.bz2"] + + 
def filter_relevant(data, node_ids):
    """Keep only rows whose ``src`` node id is in ``node_ids``."""
    return data.query("src in @allowed", local_dict={"allowed": node_ids})


def main():
    """Export a graph dataset into dgl-ke / node2vec edge-list formats.

    Reads nodes/edges, carves out a holdout set, optionally appends extra
    objective edges (name, varuse, call sequence, ...) and persists the
    train/eval/holdout splits under ``output_path``.
    """
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('dataset_path', default=None, help='Path to the dataset')
    parser.add_argument('output_path', default=None, help='')
    parser.add_argument("--extra_objectives", action="store_true", default=False)
    parser.add_argument("--eval_frac", dest="eval_frac", default=0.05, type=float)

    args = parser.parse_args()

    nodes_path, edges_path, extra_paths = get_paths(
        args.dataset_path, use_extra_objectives=args.extra_objectives
    )

    nodes, edges = load_data(nodes_path, edges_path)
    nodes, edges, holdout = SourceGraphDataset.holdout(nodes, edges)
    edges = edges.astype({"src": 'str', "dst": "str", "type": 'str'})[['src', 'dst', 'type']]
    holdout = holdout.astype({"src": 'str', "dst": "str", "type": 'str'})[['src', 'dst', 'type']]

    node2graph_id = compact_property(nodes['id'])
    nodes['global_graph_id'] = nodes['id'].apply(lambda x: node2graph_id[x])

    node_ids = set(nodes['id'].unique())

    if args.extra_objectives:
        for objective_path in extra_paths:
            data = unpersist(objective_path)
            data = filter_relevant(data, node_ids)
            # Bug fix: splitting the full path on "." yields '' for relative
            # paths like "./x/node_names.json"; use the file's base name so
            # the edge type becomes e.g. "node_names".
            data["type"] = os.path.basename(objective_path).split(".")[0]
            # Bug fix: DataFrame.append is deprecated/removed in pandas >= 2.0.
            edges = pd.concat([edges, data])

    if not os.path.isdir(args.output_path):
        os.mkdir(args.output_path)

    edges = edges[['src', 'dst', 'type']]
    eval_sample = edges.sample(frac=args.eval_frac)

    persist(nodes, join(args.output_path, "nodes_dglke.csv"))
    persist(edges, join(args.output_path, "edges_train_dglke.tsv"), header=False, sep="\t")
    persist(edges, join(args.output_path, "edges_train_node2vec.tsv"), header=False, sep=" ")
    persist(eval_sample, join(args.output_path, "edges_eval_dglke.tsv"), header=False, sep="\t")
    persist(eval_sample, join(args.output_path, "edges_eval_node2vec.tsv"), header=False, sep=" ")
    persist(holdout, join(args.output_path, "edges_eval_dglke_10000.tsv"), header=False, sep="\t")
    persist(holdout, join(args.output_path, "edges_eval_node2vec_10000.tsv"), header=False, sep=" ")


if __name__ == "__main__":
    main()
def get_paths(dataset_path, use_extra_objectives):
    """Resolve the node/edge table paths and optional extra-objective files.

    Prefers the ``largest_component`` subdirectory when present.

    :param dataset_path: root directory of the dataset
    :param use_extra_objectives: when True, also collect existing objective files
    :return: (nodes_path, edges_path, list of existing extra-objective paths)
    """
    extra_objectives = [
        "node_names.json", "common_function_variable_pairs.json",
        "common_call_seq.json", "type_annotations.json",
    ]

    largest_component = join(dataset_path, "largest_component")
    if isdir(largest_component):
        logging.info("Using graph from largest_component directory")
        nodes_path = join(largest_component, "common_nodes.json")
        edges_path = join(largest_component, "common_edges.json")
    else:
        nodes_path = join(dataset_path, "common_nodes.json")
        edges_path = join(dataset_path, "common_edges.json")

    # Idiom: a comprehension replaces the original filter(map(...)) chain.
    extra_paths = [
        path
        for path in (join(dataset_path, objective) for objective in extra_objectives)
        if use_extra_objectives and isfile(path)
    ]

    return nodes_path, edges_path, extra_paths


def filter_relevant(data, node_ids):
    """Keep only rows whose ``src`` node id is in ``node_ids``."""
    return data.query("src in @allowed", local_dict={"allowed": node_ids})


def add_counts(counter, node_ids):
    """Increment ``counter`` once for every id in ``node_ids``."""
    for node_id in node_ids:
        counter[node_id] += 1


def count_degrees(edges_path):
    """Count total degree (in + out) of every node by streaming edge chunks."""
    counter = defaultdict(int)  # idiom: defaultdict(int) over defaultdict(lambda: 0)
    for edges in read_edges(edges_path, as_chunks=True):
        add_counts(counter, edges["source_node_id"])
        add_counts(counter, edges["target_node_id"])

    return counter


def count_with_occurrence(counter, min_occurrence):
    """Return how many keys in ``counter`` have a count above ``min_occurrence``."""
    return sum(1 for count in counter.values() if count > min_occurrence)


def get_writing_mode(is_csv, first_written):
    """Build kwargs for ``persist``: append without header after the first write."""
    kwargs = {}
    if first_written is True:
        kwargs["mode"] = "a"
        if is_csv:
            kwargs["header"] = False
    return kwargs
def do_holdout(edges_path, output_path, node_descriptions, holdout_size=10000, min_count=2):
    """Stream edges, split off a random holdout set, and write both splits.

    An edge is eligible for holdout only while both of its endpoints keep a
    degree above ``min_count`` (degrees are decremented as edges are moved to
    the holdout), so no node is orphaned from the training graph.

    :param edges_path: path to the streamed edge table
    :param output_path: directory for the output .tsv files
    :param node_descriptions: node id -> textual description mapping
    :param holdout_size: approximate number of holdout edges to sample
    :param min_count: minimum degree an endpoint must retain
    :return: (final degree counter, total train edges, total holdout edges)
    """
    counter = count_degrees(edges_path)
    num_valid_candidates = count_with_occurrence(counter, min_count)

    # sampling probability that yields roughly holdout_size edges
    frac = holdout_size / num_valid_candidates

    out_edges_path = join(output_path, "edges_train_dglke.tsv")
    out_holdout_path = join(output_path, "edges_eval_dglke_10000.tsv")
    is_csv = True

    first_edges = False
    first_holdout = False

    total_edges = 0
    total_holdout = 0

    seen = set()  # (src, dst, type) triples already written, for dedup across chunks

    for edges in read_edges(edges_path, as_chunks=True):
        edges.rename({"source_node_id": "src", "target_node_id": "dst"}, axis=1, inplace=True)
        edges = edges[['src', 'dst', 'type']]

        sufficient_count = edges["src"].apply(lambda x: counter[x] > min_count) & \
                           edges["dst"].apply(lambda x: counter[x] > min_count)

        definitely_keep = edges[~sufficient_count]
        probably_keep = edges[sufficient_count]

        # Robustness: dtype=bool so an empty mask still indexes correctly
        # (np.array([]) would otherwise default to float dtype).
        probably_holdout_mask = np.array(
            [random() < frac for _ in range(len(probably_keep))], dtype=bool
        )

        probably_holdout = probably_keep[probably_holdout_mask]

        # Second pass: only move an edge to the holdout if both endpoints can
        # still spare a degree; decrement as we commit each edge.
        definitely_holdout_mask = []
        for src, dst, type_ in probably_holdout.values:
            if counter[src] > min_count and counter[dst] > min_count:
                definitely_holdout_mask.append(True)
                counter[src] -= 1
                counter[dst] -= 1
            else:
                definitely_holdout_mask.append(False)

        definitely_holdout_mask = np.array(definitely_holdout_mask, dtype=bool)

        definitely_holdout = probably_holdout[definitely_holdout_mask]
        definitely_keep = pd.concat([
            definitely_keep,
            probably_keep[~probably_holdout_mask],
            probably_holdout[~definitely_holdout_mask],
        ])

        total_edges += len(definitely_keep)
        total_holdout += len(definitely_holdout)

        def apply_description(edges):
            # Replace node ids with their textual descriptions.
            edges["src"] = edges["src"].apply(node_descriptions.get)
            edges["dst"] = edges["dst"].apply(node_descriptions.get)
            return edges

        def write_filtered(table, path, first_written):
            # Describe, dedupe within the chunk and against previous chunks,
            # then append to the output file.
            with_description = apply_description(table)

            with_description.drop_duplicates(inplace=True)

            reprs = [(src, dst, type_) for src, dst, type_ in with_description.values]

            seen_mask = np.array([r in seen for r in reprs], dtype=bool)

            with_description = with_description.loc[~seen_mask]
            seen.update(reprs)

            kwargs = get_writing_mode(is_csv, first_written)
            persist(with_description, path, sep="\t", **kwargs)

        write_filtered(definitely_keep, out_edges_path, first_edges)
        first_edges = True

        if len(definitely_holdout) > 0:
            write_filtered(definitely_holdout, out_holdout_path, first_holdout)
            first_holdout = True

    return counter, total_edges, total_holdout


def add_extra_objectives(extra_paths, output_path, node_ids):
    """Append extra-objective edges to the training edge file.

    NOTE: currently unfinished — it raises NotImplementedError before writing
    because dst ids are not yet mapped through node descriptions.
    """
    out_edges_path = join(output_path, "edges_train_dglke.tsv")

    total_extra = 0

    for objective_path in extra_paths:
        data = unpersist(objective_path)
        data = filter_relevant(data, node_ids)
        # Bug fix: the original called .split on the DataFrame column
        # (AttributeError on a Series); the edge type comes from the
        # objective file's base name, consistent with the v1 script.
        data["type"] = os.path.basename(objective_path).split(".")[0]
        data = data[["src", "dst", "type"]]
        # TODO: route through write_filtered so dst values get node
        # descriptions applied before persisting.
        raise NotImplementedError()
        kwargs = get_writing_mode(is_csv=True, first_written=True)
        persist(data, out_edges_path, sep="\t", **kwargs)  # write_filtered

        total_extra += len(data)

    return total_extra


def save_eval(output_dir, eval_frac):
    """Sample ``eval_frac`` of the training edges into an evaluation file."""
    eval_path = join(output_dir, "edges_eval_dglke.tsv")

    total_eval = 0

    for ind, edges in enumerate(read_edges(join(output_dir, "edges_train_dglke.tsv"), as_chunks=True)):
        # renamed from `eval`, which shadowed the builtin
        eval_chunk = edges.sample(frac=eval_frac)
        if len(eval_chunk) > 0:
            kwargs = get_writing_mode(is_csv=True, first_written=ind != 0)
            persist(eval_chunk, eval_path, sep="\t", **kwargs)

            total_eval += len(eval_chunk)

    return total_eval


def get_node_descriptions(nodes_path, distinct_node_types):
    """Map node id -> description: the id itself for distinct types, else the type name."""
    description = {}

    for nodes in read_nodes(nodes_path, as_chunks=True):
        transform_mask = nodes.eval(
            "type in @distinct_node_types",
            local_dict={"distinct_node_types": distinct_node_types},
        )

        nodes.loc[transform_mask, "transformed"] = nodes.loc[transform_mask, "id"].astype("string")
        nodes.loc[~transform_mask, "transformed"] = nodes.loc[~transform_mask, "type"]

        for id, desc in nodes[["id", "transformed"]].values:
            description[id] = desc

    return description
def main():
    """Streaming exporter of a graph dataset into dgl-ke edge-list format.

    Unlike the v1 script, this version never materializes the full edge
    table: degrees are counted first, then edges are streamed chunk-wise
    into train / eval / holdout files.
    """
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('dataset_path', default=None, help='Path to the dataset')
    parser.add_argument('output_path', default=None, help='')
    parser.add_argument("--extra_objectives", action="store_true", default=False)
    parser.add_argument("--eval_frac", dest="eval_frac", default=0.05, type=float)

    # Node types whose ids are kept verbatim in descriptions; all other
    # nodes are represented by their type name.
    distinct_node_types = set(node_types.values()) | {
        "FunctionDef", "mention", "Op", "#attr#", "#keyword#", "subword"
    }

    args = parser.parse_args()

    if not os.path.isdir(args.output_path):
        os.mkdir(args.output_path)

    nodes_path, edges_path, extra_paths = get_paths(
        args.dataset_path, use_extra_objectives=args.extra_objectives
    )

    node_descriptions = get_node_descriptions(nodes_path, distinct_node_types)

    counter, total_edges, total_holdout = do_holdout(edges_path, args.output_path, node_descriptions)

    total_extra = add_extra_objectives(extra_paths, args.output_path, set(counter.keys()))

    total_eval = save_eval(args.output_path, args.eval_frac)

    # The totals were previously computed and silently discarded; report them.
    logging.info(
        "Written %d train, %d holdout, %d extra, %d eval edges",
        total_edges, total_holdout, total_extra, total_eval,
    )


if __name__ == "__main__":
    main()
class ConditionalDecoderLayer(tf.keras.layers.Layer):
    """One transformer decoder block conditioned on encoder output.

    Masked self-attention over the target, cross-attention into the encoder
    states, then a position-wise feed-forward net; every sub-layer is wrapped
    in dropout + residual + layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(ConditionalDecoderLayer, self).__init__()

        def make_ffn(model_dim, hidden_dim):
            # (batch, seq_len, dff) -> (batch, seq_len, d_model)
            return tf.keras.Sequential([
                tf.keras.layers.Dense(hidden_dim, activation='relu'),
                tf.keras.layers.Dense(model_dim),
            ])

        self.mha1 = MultiHeadAttention(d_model, num_heads, return_attn_coef=True)
        self.mha2 = MultiHeadAttention(d_model, num_heads, return_attn_coef=True)

        self.ffn = make_ffn(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, look_ahead_mask=None, mask=None, training=None):
        encoder_out, x = inputs
        # encoder_out shape: (batch_size, input_seq_len, d_model)

        # 1) masked self-attention over the target sequence
        self_att, weights_self = self.mha1((x, x, x), mask=look_ahead_mask)
        self_att = self.dropout1(self_att, training=training)
        stage1 = self.layernorm1(self_att + x)  # residual connection

        # 2) cross-attention into encoder states; the encoder's keras mask is
        # tiled across the target length so padded source positions are hidden.
        cross_mask = tf.tile(
            tf.expand_dims(encoder_out._keras_mask, axis=1), (1, stage1.shape[1], 1)
        )
        cross_att, weights_cross = self.mha2((stage1, encoder_out, encoder_out), mask=cross_mask)
        cross_att = self.dropout2(cross_att, training=training)
        stage2 = self.layernorm2(cross_att + stage1)  # residual connection

        # 3) position-wise feed-forward
        ff_out = self.ffn(stage2)
        ff_out = self.dropout3(ff_out, training=training)
        stage3 = self.layernorm3(ff_out + stage2)  # residual connection

        return stage3, weights_self, weights_cross


class ConditionalAttentionDecoder(tf.keras.layers.Layer):
    """Stack of ConditionalDecoderLayer blocks with token + positional embeddings."""

    def __init__(self, input_dim, out_dim, num_layers, num_heads, ff_hidden, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(ConditionalAttentionDecoder, self).__init__()

        self.d_model = out_dim
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, input_dim)
        self.pos_encoding = positional_encoding(maximum_position_encoding, input_dim)

        self.dec_layers = [
            ConditionalDecoderLayer(input_dim, num_heads, ff_hidden, rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(rate)

        # resized lazily in call() to match the current target length
        self.look_ahead_mask = self.create_look_ahead_mask(1)
        self.fc_out = Dense(out_dim)

    def create_look_ahead_mask(self, size):
        # upper-triangular mask: position i cannot attend to positions > i
        return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # (seq_len, seq_len)

    def compute_mask(self, inputs, mask=None):
        # propagate the incoming mask unchanged
        return mask

    def call(self, inputs, training=None, mask=None):
        if len(inputs) == 2:
            encoder_out, target = inputs
        elif len(inputs) == 1:
            # NOTE(review): this keeps the 1-tuple itself as encoder_out
            # rather than unpacking inputs[0] — confirm this is intended.
            encoder_out = inputs
            target = None
        else:
            raise ValueError("Incorrect number of parameters")

        if target is None:
            # no target given: run the decoder directly over encoder states
            seq_len = tf.shape(encoder_out)[1]
            hidden = encoder_out
        else:
            seq_len = tf.shape(target)[1]
            hidden = self.embedding(target)  # (batch_size, target_seq_len, d_model)
            hidden += self.pos_encoding[:, :seq_len, :]

        if self.look_ahead_mask.shape[0] != seq_len:
            self.look_ahead_mask = self.create_look_ahead_mask(seq_len)

        attention_weights = {}

        hidden = self.dropout(hidden, training=training)

        for i in range(self.num_layers):
            hidden, block1, block2 = self.dec_layers[i](
                (encoder_out, hidden), look_ahead_mask=self.look_ahead_mask, mask=mask, training=training
            )

            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2

        # hidden shape: (batch_size, target_seq_len, d_model)
        return self.fc_out(hidden), attention_weights


class FlatDecoder(Layer):
    """Two-layer MLP head; returns (logits, None) to mirror the attention decoders."""

    def __init__(self, out_dims, hidden=100, dropout=0.1):
        super(FlatDecoder, self).__init__()
        self.fc1 = Dense(hidden, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.HeNormal())
        self.drop = Dropout(rate=dropout)
        self.fc2 = Dense(out_dims)

    def call(self, inputs, training=None, mask=None):
        encoder_out, target = inputs
        hidden = self.fc1(encoder_out, training=training)
        hidden = self.drop(hidden, training=training)
        return self.fc2(hidden, training=training), None
class DefaultEmbedding(Layer):
    """
    Embedding table extended with one extra all-zero row, so that index -1
    resolves to a default zero vector.
    """

    def __init__(self, init_vectors=None, shape=None, trainable=True):
        super(DefaultEmbedding, self).__init__()

        if init_vectors is not None:
            # initialize from pretrained vectors
            self.embs = tf.Variable(init_vectors, dtype=tf.float32,
                                    trainable=trainable, name="default_embedder_var")
            shape = init_vectors.shape
        else:
            # TODO
            # the default value is no longer constant. need to replace this with a standard embedder
            self.embs = tf.Variable(tf.random.uniform(shape=(shape[0], shape[1]), dtype=tf.float32),
                                    name="default_embedder_pad")
        # extra zero row appended at lookup time; index -1 lands here
        self.pad = tf.zeros(shape=(1, shape[1]), name="default_embedder_pad")

    def call(self, ids):
        # concatenate the pad row on every call so gradients flow into embs
        emb_matr = tf.concat([self.embs, self.pad], axis=0)
        return tf.nn.embedding_lookup(params=emb_matr, ids=ids)


class PositionalEncoding(Model):
    """
    Trainable relative positional embeddings built from a Toeplitz index
    matrix. Currently unused because it incurs N^2 computational cost;
    candidate for moving into batch preparation.
    """

    def __init__(self, seq_len, pos_emb_size):
        """
        :param seq_len: maximum sequence length
        :param pos_emb_size: dimensionality of positional embeddings
        """
        super(PositionalEncoding, self).__init__()

        all_positions = list(range(seq_len * 2))
        first_column = all_positions[:seq_len]
        first_column.reverse()
        # Toeplitz matrix of relative offsets between every pair of positions
        self.position_encoding = tf.constant(
            toeplitz(first_column, all_positions[seq_len:]),
            dtype=tf.int32,
            name="position_encoding",
        )
        self.position_embedding = tf.Variable(
            tf.random.uniform(shape=(seq_len * 2, pos_emb_size), dtype=tf.float32),
            name="position_embedding",
        )

    def call(self):
        return tf.nn.embedding_lookup(self.position_embedding, self.position_encoding,
                                      name="position_lookup")


class TextCnnLayer(Model):
    """Single 2-D convolution over token embeddings with same-length padding."""

    def __init__(self, out_dim, kernel_shape, activation=None):
        super(TextCnnLayer, self).__init__()

        self.kernel_shape = kernel_shape
        self.out_dim = out_dim

        self.textConv = Conv2D(filters=out_dim, kernel_size=kernel_shape,
                               activation=activation, data_format='channels_last')

        # symmetric padding keeps the sequence length unchanged;
        # requires an odd kernel height
        pad = (self.kernel_shape[0] - 1) // 2
        assert pad * 2 + 1 == self.kernel_shape[0]
        self.pad_constant = tf.constant([[0, 0], [pad, pad], [0, 0], [0, 0]])

        self.supports_masking = True

    def call(self, x, training=None, mask=None):
        padded = tf.pad(x, self.pad_constant)
        convolved = self.textConv(padded)
        # drop the singleton width axis left by the convolution
        return tf.squeeze(convolved, axis=-2)
class TextCnnEncoder(Model):
    """
    TextCnnEncoder model for classifying tokens in a sequence. The model uses
    the following pipeline:

    token_embeddings (provided from outside) ->
    several convolutional layers, get representations for all tokens ->
    pass representation for all tokens through a dense network ->
    classify each token
    """

    def __init__(self,
                 h_sizes, seq_len,
                 pos_emb_size, cnn_win_size, dense_size, out_dim,
                 activation=None, dense_activation=None, drop_rate=0.2):
        """
        :param h_sizes: sizes of hidden CNN layers, internal dimensionality of token embeddings
        :param seq_len: maximum sequence length
        :param pos_emb_size: dimensionality of positional embeddings
        :param cnn_win_size: width of cnn window
        :param dense_size: number of units in dense network
        :param out_dim: number of output units
        :param activation: activation for cnn
        :param dense_activation: activation for dense layers; falls back to ``activation`` when None
        :param drop_rate: dropout rate for dense network
        """
        super(TextCnnEncoder, self).__init__()

        self.seq_len = seq_len
        self.h_sizes = h_sizes
        self.dense_size = dense_size
        self.out_dim = out_dim
        self.pos_emb_size = pos_emb_size
        self.cnn_win_size = cnn_win_size
        self.activation = activation
        self.dense_activation = dense_activation
        self.drop_rate = drop_rate

        self.supports_masking = True

    def build(self, input_shape):
        # expects (batch, seq_len, input_size)
        assert len(input_shape) == 3

        input_size = input_shape[2]

        def infer_kernel_sizes(h_sizes):
            """
            Compute kernel sizes from the desired dimensionality of hidden layers.
            The first kernel spans the input embedding width, each following one
            spans the previous layer's output width.
            """
            kernel_sizes = copy(h_sizes)
            kernel_sizes.pop(-1)  # pop last because it is the output of the last CNN layer
            kernel_sizes.insert(0, input_size)
            return [(self.cnn_win_size, ks) for ks in kernel_sizes]

        kernel_sizes = infer_kernel_sizes(self.h_sizes)

        self.layers_tok = [
            TextCnnLayer(out_dim=h_size, kernel_shape=kernel_size, activation=self.activation)
            for h_size, kernel_size in zip(self.h_sizes, kernel_sizes)
        ]

        # Bug fix: the original assigned the fallback to a *local* variable
        # (`dense_activation = self.activation`) that was never read, so the
        # first dense layer silently ran with no activation whenever
        # dense_activation was left as None.
        if self.dense_activation is None:
            self.dense_activation = self.activation

        self.dense_1 = Dense(self.dense_size, activation=self.dense_activation)
        self.dropout_1 = tf.keras.layers.Dropout(rate=self.drop_rate)
        self.dense_2 = Dense(self.out_dim, activation=None)  # logits
        self.dropout_2 = tf.keras.layers.Dropout(rate=self.drop_rate)

    def compute_mask(self, inputs, mask=None):
        # propagate the incoming mask unchanged
        return mask

    def call(self, embs, training=True, mask=None):
        # embs shape: (?, seq_len, input_size)
        temp_cnn_emb = embs

        # pass embeddings through several CNN layers
        for l in self.layers_tok:
            temp_cnn_emb = l(tf.expand_dims(temp_cnn_emb, axis=3))  # (?, seq_len, h_size)

        # TODO
        # simplify to one CNN and one attention

        cnn_pool_features = temp_cnn_emb

        local_h2 = self.dense_1(cnn_pool_features)  # (?, seq_len, dense_size)
        tag_logits = self.dense_2(local_h2)  # (?, seq_len, out_dim)

        return tag_logits


class GRUEncoder(Model):
    """Stack of bidirectional GRU layers with dropout between layers."""

    def __init__(self, input_dim, out_dim=100, num_layers=1, dropout=0.1):
        super(GRUEncoder, self).__init__()
        self.num_layers = num_layers

        self.gru_layers = [
            tf.keras.layers.Bidirectional(GRU(out_dim, dropout=dropout, return_sequences=True))
            for _ in range(num_layers)
        ]

        self.dropout = Dropout(dropout)
        self.supports_masking = True

    def call(self, inputs, training=None, mask=None):
        x = inputs

        for layer in self.gru_layers:
            x = layer(x, training=training, mask=mask)
            x = self.dropout(x, training=training)

        return x
vocab_size, nheads=1, layers=1): + def __init__(self, encoder_out_dim, decoder_dim, out_dim, vocab_size, seq_len, nheads=1, layers=1): super(Decoder, self).__init__() self.encoder_adapter = nn.Linear(encoder_out_dim, decoder_dim) self.embed = nn.Embedding(vocab_size, decoder_dim) self.decoder_layer = nn.TransformerDecoderLayer(decoder_dim, nheads, dim_feedforward=decoder_dim) self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=layers) self.mask = self.generate_square_subsequent_mask(1) + self.seq_len = seq_len + + self.position_encoding = positional_encoding(self.seq_len, decoder_dim).permute(1, 0, 2) self.fc = nn.Linear(decoder_dim, out_dim) def forward(self, encoder_out, target): encoder_out = self.encoder_adapter(encoder_out).permute(1, 0, 2) target = self.embed(target).permute(1, 0, 2) - if self.mask.size(0) != target.size(0): + target = target + self.position_encoding[:target.shape[0]] + if self.mask.size(0) != target.size(0): # for self-attention self.mask = self.generate_square_subsequent_mask(target.size(0)).to(encoder_out.device) out = self.decoder(target, encoder_out, tgt_mask=self.mask) @@ -64,6 +72,7 @@ class LSTMDecoder(nn.Module): def __init__(self, num_buckets, padding=0, encoder_embed_dim=100, embed_dim=100, out_embed_dim=100, num_layers=1, dropout_in=0.1, dropout_out=0.1, use_cuda=True): + embed_dim = out_embed_dim = encoder_embed_dim super(LSTMDecoder, self).__init__() self.use_cuda = use_cuda self.dropout_in = dropout_in @@ -73,7 +82,7 @@ def __init__(self, num_buckets, padding=0, encoder_embed_dim=100, embed_dim=100, padding_idx = padding self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) - self.create_layers(embed_dim, embed_dim, num_layers) + self.create_layers(encoder_embed_dim, embed_dim, num_layers) self.attention = AttentionLayer(out_embed_dim, encoder_embed_dim, embed_dim) if embed_dim != out_embed_dim: @@ -82,10 +91,14 @@ def __init__(self, num_buckets, padding=0, encoder_embed_dim=100, embed_dim=100, 
def create_layers(self, encoder_embed_dim, embed_dim, num_layers): self.layers = nn.ModuleList([ - LSTMCell(encoder_embed_dim + embed_dim if layer == 0 else embed_dim, embed_dim) + LSTMCell(encoder_embed_dim + embed_dim if layer == 0 else encoder_embed_dim, encoder_embed_dim if layer == 0 else embed_dim) for layer in range(num_layers) ]) + self.norms = nn.ModuleList([ + nn.LayerNorm(encoder_embed_dim + embed_dim if layer == 0 else encoder_embed_dim) for layer in range(num_layers) + ]) + def forward(self, prev_output_tokens, encoder_out, incremental_state=None, inference=False): if incremental_state is not None: # TODO what is this? prev_output_tokens = prev_output_tokens[:, -1:] @@ -112,9 +125,9 @@ def forward(self, prev_output_tokens, encoder_out, incremental_state=None, infer else: # _, encoder_hiddens, encoder_cells = encoder_out num_layers = len(self.layers) - prev_hiddens = [Variable(x.data.new(bsz, embed_dim).zero_()) for i in range(num_layers)] - prev_cells = [Variable(x.data.new(bsz, embed_dim).zero_()) for i in range(num_layers)] - input_feed = Variable(x.data.new(bsz, embed_dim).zero_()) + prev_hiddens = [Variable(x.data.new(bsz, embed_dim).zero_()) if i != 0 else encoder_outs[0] for i in range(num_layers)] + prev_cells = [Variable(x.data.new(bsz, embed_dim if i!=0 else encoder_outs.size(-1)).zero_()) for i in range(num_layers)] + input_feed = Variable(x.data.new(bsz, encoder_outs.size(-1)).zero_()) attn_scores = Variable(x.data.new(srclen, seqlen, bsz).zero_()) outs = [] @@ -122,8 +135,9 @@ def forward(self, prev_output_tokens, encoder_out, incremental_state=None, infer # input feeding: concatenate context vector from previous time step input = torch.cat((x[j, :, :], input_feed), dim=1) - for i, rnn in enumerate(self.layers): + for i, (rnn, lnorm) in enumerate(zip(self.layers, self.norms)): # recurrent cell + input = lnorm(input) hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i])) # hidden state becomes the input to the next layer diff --git 
def positional_encoding(position, d_model):
    """Sinusoidal positional encodings as in "Attention Is All You Need".

    Returns a numpy array of shape ``(1, position, d_model)``.  Each dimension
    pair ``(2i, 2i+1)`` oscillates with rate ``1 / 10000**(2i / d_model)``;
    even dimensions take the sine of the angle, odd dimensions the cosine.
    """
    positions = np.arange(position)[:, np.newaxis]   # (position, 1)
    dims = np.arange(d_model)[np.newaxis, :]         # (1, d_model)

    # Same rate for both members of a (2i, 2i+1) pair, hence the // 2.
    inv_rates = 1 / np.power(10000, (2 * (dims // 2)) / np.float32(d_model))
    angles = positions * inv_rates

    angles[:, 0::2] = np.sin(angles[:, 0::2])  # even indices -> sin
    angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd indices -> cos

    # Leading batch axis so the table broadcasts over a batch of sequences.
    return angles[np.newaxis, ...]
"""Training configuration: nested default options plus helpers to build,
update, save, and load a configuration dict."""

from copy import deepcopy


# Default values for every recognized option, grouped by section.  Keys must
# be unique across DATASET/TRAINING/MODEL because `update_config` matches
# keyword arguments against all sections at once (checked in `default_config`).
config_specification = {
    "DATASET": {
        "data_path": None,
        "train_frac": 0.9,
        "filter_edges": None,
        "min_count_for_objectives": 5,
        # "packages_file": None,  # partition file
        "self_loops": False,
        "use_node_types": False,
        "use_edge_types": False,
        "no_global_edges": False,
        "remove_reverse": False,
        "custom_reverse": None,
        "restricted_id_pool": None,
        "random_seed": None,
        "subgraph_id_column": "mentioned_in",
        "subgraph_partition": None
    },
    "TRAINING": {
        "model_output_dir": None,
        "pretrained": None,
        "pretraining_phase": 0,

        "sampling_neighbourhood_size": 10,
        "neg_sampling_factor": 3,
        "use_layer_scheduling": False,
        "schedule_layers_every": 10,

        "elem_emb_size": 100,
        "embedding_table_size": 200000,

        "epochs": 100,
        "batch_size": 128,
        "learning_rate": 1e-3,

        "objectives": None,
        "save_each_epoch": False,
        "save_checkpoints": True,
        "early_stopping": False,
        "early_stopping_tolerance": 20,

        "force_w2v_ns": False,
        "use_ns_groups": False,
        "nn_index": "brute",

        "metric": "inner_prod",

        "measure_scores": False,
        "dilate_scores": 200,  # downsample

        "gpu": -1,

        "external_dataset": None,

        "restore_state": False,
    },
    "MODEL": {
        "node_emb_size": 100,
        "h_dim": 100,
        "n_layers": 5,
        "use_self_loop": True,

        "use_gcn_checkpoint": False,
        "use_att_checkpoint": False,
        "use_gru_checkpoint": False,

        'num_bases': 10,
        'dropout': 0.0,

        'activation': "tanh",
        # torch.nn.functional.hardswish], #[torch.nn.functional.hardtanh], #torch.nn.functional.leaky_relu
    },
    "TOKENIZER": {
        "tokenizer_path": None,
    }
}


def default_config():
    """Return a fresh, independent copy of the default configuration.

    Bug fix: this previously used a shallow ``copy()``, so the nested section
    dicts were shared with ``config_specification`` — any ``update_config``
    call mutated the module-level defaults and leaked overridden values into
    every later ``get_config()`` call.  ``deepcopy`` isolates each copy.
    """
    config = deepcopy(config_specification)

    # TOKENIZER is intentionally excluded from this check, as in the original.
    assert len(
        set(config["MODEL"].keys()) |
        set(config["TRAINING"].keys()) |
        set(config["DATASET"].keys())
    ) == (
        len(set(config["MODEL"].keys())) +
        len(set(config["TRAINING"].keys())) +
        len(set(config["DATASET"].keys()))
    ), "All parameter names in the configuration should be unique"
    return config


def get_config(**kwargs):
    """Build a config from the defaults, overriding options given in kwargs."""
    config = default_config()
    return update_config(config, **kwargs)


def update_config(config, **kwargs):
    """Set recognized options in `config` in place.

    Every kwarg is matched against all sections; a ValueError is raised if any
    option name is not present in any section.  Returns `config`.
    """
    recognized_options = set()

    for section, args in config.items():
        for key, value in kwargs.items():
            if key in args:
                args[key] = value
                recognized_options.add(key)

    unrecognized = {key: value for key, value in kwargs.items() if key not in recognized_options}

    if len(unrecognized) > 0:
        raise ValueError(f"Some configuration options are not recognized: {unrecognized}")

    return config


def save_config(config, path):
    """Serialize `config` as YAML to `path` (file handle is closed)."""
    import yaml  # local import: YAML only needed for (de)serialization
    with open(path, "w") as sink:
        yaml.dump(config, sink)


def load_config(path):
    """Load a configuration dict from a YAML file (file handle is closed)."""
    import yaml  # local import: YAML only needed for (de)serialization
    with open(path, "r") as source:
        return yaml.load(source.read(), Loader=yaml.Loader)
def add_pretraining_arguments(parser):
    """Options controlling pretrained vectors and the pretraining phase."""
    arg = parser.add_argument
    arg('--pretrained', '-p', default=None, help='Path to pretrained subtoken vectors')
    arg('--tokenizer_path', '-t', default=None, help='???')
    arg('--pretraining_phase', default=0, type=int, help='Number of epochs for pretraining')


def add_training_arguments(parser):
    """Core optimization, model-size, and sampling options."""
    arg = parser.add_argument
    arg('--embedding_table_size', default=200000, type=int,
        help='Bucket size for the embedding table. Overriden when pretrained vectors provided???')
    arg('--random_seed', default=None, type=int, help='Random seed for generating dataset splits')

    arg('--node_emb_size', default=100, type=int, help='Dimensionality of node embeddings')
    arg('--elem_emb_size', default=100, type=int,
        help='Dimensionality of target embeddings (node names). Should match node embeddings when cosine distance loss is used')
    arg('--sampling_neighbourhood_size', default=10, type=int, help='Number of dependencies to sample per node')
    arg('--neg_sampling_factor', default=3, type=int, help='Number of negative samples for each positive')

    arg('--use_layer_scheduling', action='store_true', help='???')
    arg('--schedule_layers_every', default=10, type=int, help='???')

    arg('--epochs', default=100, type=int, help='Number of epochs')
    arg('--batch_size', default=128, type=int, help='Batch size')

    arg('--h_dim', default=None, type=int, help='Should be the same as `node_emb_size`')
    arg('--n_layers', default=5, type=int, help='Number of layers')
    arg('--objectives', default=None, type=str, help='???')

    arg('--save_each_epoch', action='store_true',
        help='Save checkpoints for each epoch (high disk space utilization)')
    arg('--early_stopping', action='store_true', help='???')
    arg('--early_stopping_tolerance', default=20, type=int, help='???')
    arg('--force_w2v_ns', action='store_true', help='Use w2v negative sampling strategy p_unigram^(3/4)')
    arg('--use_ns_groups', action='store_true', help='Perform negative sampling only from closest neighbours???')

    arg('--metric', default='inner_prod', type=str, help='???')
    arg('--nn_index', default='brute', type=str, help='Index backend for generating negative samples???')

    arg('--external_dataset', default=None, type=str, help='Path to external graph, use for inference')


def add_scoring_arguments(parser):
    """Options controlling score measurement during training."""
    arg = parser.add_argument
    arg('--measure_scores', action='store_true')
    arg('--dilate_scores', default=200, type=int, help='')


def add_performance_arguments(parser):
    """Checkpointing toggles (gradient checkpointing and model checkpoints)."""
    arg = parser.add_argument
    # Note the inverted flag: --no_checkpoints stores False into save_checkpoints.
    arg('--no_checkpoints', dest="save_checkpoints", action='store_false')

    arg('--use_gcn_checkpoint', action='store_true')
    arg('--use_att_checkpoint', action='store_true')
    arg('--use_gru_checkpoint', action='store_true')


def add_gnn_train_args(parser):
    """Register the full GNN-training CLI on `parser`."""
    parser.add_argument("--config", default=None, help="Path to config file")
    add_data_arguments(parser)
    add_pretraining_arguments(parser)
    add_training_arguments(parser)
    add_scoring_arguments(parser)
    add_performance_arguments(parser)

    # parser.add_argument('--note', dest='note', default="", help='Note, added to metadata')
    parser.add_argument('model_output_dir', help='Location of the final model')

    # parser.add_argument('--intermediate_supervision', action='store_true')
    parser.add_argument('--gpu', dest='gpu', default=-1, type=int, help='')


def verify_arguments(args):
    """Placeholder for argument validation (intentionally a no-op)."""
    pass
defaultdict +from time import time from math import ceil from typing import Dict, Optional, List -from spacy.gold import biluo_tags_from_offsets +import spacy -from SourceCodeTools.code.ast_tools import get_declarations +from SourceCodeTools.code.ast.ast_tools import get_declarations from SourceCodeTools.models.ClassWeightNormalizer import ClassWeightNormalizer from SourceCodeTools.nlp import create_tokenizer, tag_map_from_sentences, TagMap, token_hasher, try_int from SourceCodeTools.nlp.entity import fix_incorrect_tags +from SourceCodeTools.code.annotator_utils import adjust_offsets, biluo_tags_from_offsets from SourceCodeTools.nlp.entity.utils import overlap - import numpy as np + + + def filter_unlabeled(entities, declarations): """ Get a list of declarations that were not mentioned in `entities` @@ -33,37 +38,46 @@ def filter_unlabeled(entities, declarations): return for_mask +def print_token_tag(doc, tags): + for t, tag in zip(doc, tags): + print(t, "\t", tag) + + class PythonBatcher: def __init__( self, data, batch_size: int, seq_len: int, - graphmap: Dict[str, int], wordmap: Dict[str, int], tagmap: Optional[TagMap] = None, + wordmap: Dict[str, int], *, graphmap: Optional[Dict[str, int]], tagmap: Optional[TagMap] = None, mask_unlabeled_declarations=True, - class_weights=False, element_hash_size=1000 + class_weights=False, element_hash_size=1000, len_sort=True, tokenizer="spacy", no_localization=False ): self.create_cache() - self.data = data + self.data = sorted(data, key=lambda x: len(x[0])) if len_sort else data self.batch_size = batch_size self.seq_len = seq_len self.class_weights = None self.mask_unlabeled_declarations = mask_unlabeled_declarations + self.tokenizer = tokenizer + if tokenizer == "codebert": + self.vocab = spacy.blank("en").vocab + self.no_localization = no_localization - self.nlp = create_tokenizer("spacy") + self.nlp = create_tokenizer(tokenizer) if tagmap is None: self.tagmap = tag_map_from_sentences(list(zip(*[self.prepare_sent(sent) 
for sent in data]))[1]) else: self.tagmap = tagmap - self.graphpad = len(graphmap) + self.graphpad = len(graphmap) if graphmap is not None else None self.wordpad = len(wordmap) self.tagpad = self.tagmap["O"] self.prefpad = element_hash_size self.suffpad = element_hash_size - self.graphmap_func = lambda g: graphmap.get(g, len(graphmap)) + self.graphmap_func = (lambda g: graphmap.get(g, len(graphmap))) if graphmap is not None else None self.wordmap_func = lambda w: wordmap.get(w, len(wordmap)) - self.tagmap_func = lambda t: self.tagmap.get(t, len(self.tagmap)) + self.tagmap_func = lambda t: self.tagmap.get(t, self.tagmap["O"]) self.prefmap_func = lambda w: token_hasher(w[:3], element_hash_size) self.suffmap_func = lambda w: token_hasher(w[-3:], element_hash_size) @@ -81,10 +95,26 @@ def __init__( else: self.classw_func = lambda t: 1. + def __del__(self): + self.sent_cache.close() + self.batch_cache.close() + + from shutil import rmtree + rmtree(self.tmp_dir, ignore_errors=True) + def create_cache(self): - self.tmp_dir = tempfile.TemporaryDirectory() - self.sent_cache = shelve.open(os.path.join(self.tmp_dir.name, "sent_cache.db")) - self.batch_cache = shelve.open(os.path.join(self.tmp_dir.name, "batch_cache.db")) + char_ranges = [chr(i) for i in range(ord("a"), ord("a")+26)] + [chr(i) for i in range(ord("A"), ord("A")+26)] + [chr(i) for i in range(ord("0"), ord("0")+10)] + from random import sample + rnd_name = "".join(sample(char_ranges, k=10)) + str(int(time() * 1e6)) + time() + + self.tmp_dir = os.path.join(tempfile.gettempdir(), rnd_name) + if os.path.isdir(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + os.mkdir(self.tmp_dir) + + self.sent_cache = shelve.open(os.path.join(self.tmp_dir, "sent_cache")) + self.batch_cache = shelve.open(os.path.join(self.tmp_dir, "batch_cache")) def num_classes(self): return len(self.tagmap) @@ -106,16 +136,48 @@ def prepare_sent(self, sent): unlabeled_dec = filter_unlabeled(ents, get_declarations(text)) tokens = [t.text for t in doc] 
- ents_tags = biluo_tags_from_offsets(doc, ents) - repl_tags = biluo_tags_from_offsets(doc, repl) + + if self.tokenizer == "codebert": + backup_tokens = doc + fixed_spaces = [False] + fixed_words = [""] + + for ind, t in enumerate(doc): + if len(t.text) > 1: + fixed_words.append(t.text.strip("Ġ")) + else: + fixed_words.append(t.text) + if ind != 0: + fixed_spaces.append(t.text.startswith("Ġ") and len(t.text) > 1) + fixed_spaces.append(False) + fixed_spaces.append(False) + fixed_words.append("") + + assert len(fixed_spaces) == len(fixed_words) + + from spacy.tokens import Doc + doc = Doc(self.vocab, fixed_words, fixed_spaces) + + assert len(doc) - 2 == len(backup_tokens) + assert len(doc.text) - 7 == len(backup_tokens.text) + ents = adjust_offsets(ents, -3) + repl = adjust_offsets(repl, -3) + if self.mask_unlabeled_declarations: + unlabeled_dec = adjust_offsets(unlabeled_dec, -3) + + ents_tags = biluo_tags_from_offsets(doc, ents, self.no_localization) + repl_tags = biluo_tags_from_offsets(doc, repl, self.no_localization) if self.mask_unlabeled_declarations: - unlabeled_dec = biluo_tags_from_offsets(doc, unlabeled_dec) + unlabeled_dec = biluo_tags_from_offsets(doc, unlabeled_dec, self.no_localization) fix_incorrect_tags(ents_tags) fix_incorrect_tags(repl_tags) if self.mask_unlabeled_declarations: fix_incorrect_tags(unlabeled_dec) + if self.tokenizer == "codebert": + tokens = [""] + [t.text for t in backup_tokens] + [""] + assert len(tokens) == len(ents_tags) == len(repl_tags) if self.mask_unlabeled_declarations: assert len(tokens) == len(unlabeled_dec) @@ -151,7 +213,7 @@ def encode(seq, encode_func, pad): pref = encode(sent, self.prefmap_func, self.prefpad) suff = encode(sent, self.suffmap_func, self.suffpad) s = encode(sent, self.wordmap_func, self.wordpad) - r = encode(repl, self.graphmap_func, self.graphpad) # TODO test + r = encode(repl, self.graphmap_func, self.graphpad) if self.graphmap_func is not None else None # labels t = encode(tags, self.tagmap_func, 
self.tagpad) @@ -160,38 +222,56 @@ def encode(seq, encode_func, pad): hidem = encode( list(range(len(sent))) if unlabeled_decls is None else unlabeled_decls, self.mask_unlbl_func, self.mask_unlblpad - ) + ).astype(np.bool) # class weights classw = encode(tags, self.classw_func, self.classwpad) - assert len(s) == len(r) == len(pref) == len(suff) == len(t) == len(classw) == len(hidem) + assert len(s) == len(pref) == len(suff) == len(t) == len(classw) == len(hidem) + if r is not None: + assert len(r) == len(s) + + no_localization_mask = np.array([tag != self.tagpad for tag in t]).astype(np.bool) output = { + "toks": sent, "tok_ids": s, - "graph_ids": r, + "replacements": repl, + # "graph_ids": r, "prefix": pref, "suffix": suff, "tags": t, "class_weights": classw, "hide_mask": hidem, - "lens": len(s) if len(s) < self.seq_len else self.seq_len + "no_loc_mask": no_localization_mask, + "lens": len(sent) if len(sent) < self.seq_len else self.seq_len } + if r is not None: + output["graph_ids"] = r + self.batch_cache[input_json] = output return output def format_batch(self, batch): - fbatch = { - "tok_ids": [], "graph_ids": [], "prefix": [], "suffix": [], - "tags": [], "class_weights": [], "hide_mask": [], "lens": [] - } + # fbatch = { + # "tok_ids": [], "graph_ids": [], "prefix": [], "suffix": [], + # "tags": [], "class_weights": [], "hide_mask": [], "lens": [], "replacements": [] + # } + fbatch = defaultdict(list) for sent in batch: for key, val in sent.items(): fbatch[key].append(val) - return {key: np.stack(val) if key != "lens" else np.array(val, dtype=np.int32) for key, val in fbatch.items()} + if len(fbatch["graph_ids"]) == 0: + fbatch.pop("graph_ids") + + max_len = max(fbatch["lens"]) + + return { + key: np.stack(val)[:,:max_len] if key != "lens" and key != "replacements" and key != "toks" + else (np.array(val, dtype=np.int32) if key == "lens" else np.array(val)) for key, val in fbatch.items()} def generate_batches(self): batch = [] @@ -200,7 +280,9 @@ def 
import pickle

import torch
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel

from SourceCodeTools.models.Embedder import Embedder
from SourceCodeTools.nlp.entity.type_prediction import get_type_prediction_arguments, ModelTrainer
from SourceCodeTools.nlp.entity.utils.data import read_data


def load_typed_nodes(path):
    """Return the set of node ids (column ``src``) that carry a type annotation.

    Rows whose ``dst`` contains "0x" are dropped (address-like artifacts).
    """
    from SourceCodeTools.code.data.file_utils import unpersist
    type_ann = unpersist(path)

    filter_rule = lambda name: "0x" not in name

    type_ann = type_ann[
        type_ann["dst"].apply(filter_rule)
    ]

    typed_nodes = set(type_ann["src"].tolist())
    return typed_nodes


class CodeBertModelTrainer(ModelTrainer):
    """Extracts CodeBERT token embeddings for typed nodes and pickles an Embedder."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def set_type_ann_edges(self, path):
        # Path to the type-annotation edge table; must be set before train_model.
        self.type_ann_edges = path

    def get_batcher(self, *args, **kwargs):
        # Force the CodeBERT BPE tokenizer so token positions match the model.
        kwargs.update({"tokenizer": "codebert"})
        return self.batcher(*args, **kwargs)

    def train_model(self):
        """Run CodeBERT over all data and collect one embedding per typed node.

        Fixes over the previous version:
        - off-by-one: ``position > 512`` let ``position == 512`` through, which
          raises IndexError on a tensor with at most 512 rows; now bounded by
          the actual sequence length;
        - embeddings are moved to CPU before stacking, so the final
          ``.numpy()`` conversion also works when the model runs on GPU;
        - input tensors are moved to the model's device.
        """
        typed_nodes = load_typed_nodes(self.type_ann_edges)

        decoder_mapping = RobertaTokenizer.from_pretrained("microsoft/codebert-base").decoder
        tok_ids, words = zip(*decoder_mapping.items())
        vocab_mapping = dict(zip(words, tok_ids))
        batcher = self.get_batcher(
            self.train_data + self.test_data, self.batch_size, seq_len=self.seq_len,
            graphmap=None,
            wordmap=vocab_mapping, tagmap=None,
            class_weights=False, element_hash_size=1
        )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = RobertaModel.from_pretrained("microsoft/codebert-base")
        model.to(device)

        node_ids = []
        embeddings = []
        added = set()

        def get_length_mask(target, lens):
            # True for positions before each sequence's end.
            return torch.arange(target.size(1)).to(target.device)[None, :] < lens[:, None]

        for ind, batch in enumerate(tqdm(batcher)):
            token_ids = torch.LongTensor(batch["tok_ids"]).to(device)
            lens = torch.LongTensor(batch["lens"]).to(device)

            # Batcher marks out-of-vocabulary tokens with index len(vocab_mapping).
            # NOTE(review): the "" key looks like a mangled "<unk>" — confirm
            # against the tokenizer vocabulary.
            token_ids[token_ids == len(vocab_mapping)] = vocab_mapping[""]

            mask = get_length_mask(token_ids, lens)
            with torch.no_grad():
                embs = model(input_ids=token_ids, attention_mask=mask)

            for s_emb, s_repl in zip(embs.last_hidden_state, batch["replacements"]):
                unique_repls = set(list(s_repl))
                repls_for_ann = [r for r in unique_repls if r in typed_nodes]

                for r in repls_for_ann:
                    position = s_repl.index(r)
                    if position >= s_emb.size(0):
                        # Token fell outside the truncated sequence window.
                        continue
                    if r not in added:
                        node_ids.append(r)
                        embeddings.append(s_emb[position].cpu())
                        added.add(r)

        all_embs = torch.stack(embeddings, dim=0).numpy()
        embedder = Embedder(dict(zip(node_ids, range(len(node_ids)))), all_embs)
        pickle.dump(embedder, open("codebert_embeddings.pkl", "wb"), fix_imports=False)
        print(node_ids)


def main():
    """CLI entry point: read data, extract embeddings for typed nodes."""
    args = get_type_prediction_arguments()

    # The redundant RobertaModel.from_pretrained call that used to live here
    # was removed: train_model loads its own copy of the model.
    train_data, test_data = read_data(
        open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True,
        include_only="entities",
        min_entity_count=args.min_entity_count, random_seed=args.random_seed
    )

    trainer = CodeBertModelTrainer(train_data, test_data, params={}, seq_len=512)
    trainer.set_type_ann_edges(args.type_ann_edges)
    trainer.train_model()


if __name__ == "__main__":
    main()
padding_idx, num_classes, dense_hidden=100, dropout=0.1, bert_emb_size=768, + no_graph=False + ): + super(CodebertHybridModel, self).__init__() + + self.codebert_model = codebert_model + self.use_graph = not no_graph + + num_emb = padding_idx + 1 # padding id is usually not a real embedding + emb_dim = graph_emb.shape[1] + self.graph_emb = nn.Embedding(num_embeddings=num_emb, embedding_dim=emb_dim, padding_idx=padding_idx) + + import numpy as np + pretrained_embeddings = torch.from_numpy(np.concatenate([graph_emb, np.zeros((1, emb_dim))], axis=0)).float() + new_param = torch.nn.Parameter(pretrained_embeddings) + assert self.graph_emb.weight.shape == new_param.shape + self.graph_emb.weight = new_param + self.graph_emb.weight.requires_grad = False + + self.fc1 = nn.Linear( + bert_emb_size + (emb_dim if self.use_graph else 0), + dense_hidden + ) + self.drop = nn.Dropout(dropout) + self.fc2 = nn.Linear(dense_hidden, num_classes) + + self.loss_f = nn.CrossEntropyLoss(reduction="mean") + + def forward(self, token_ids, graph_ids, mask, finetune=False): + if finetune: + x = self.codebert_model(input_ids=token_ids, attention_mask=mask).last_hidden_state + else: + with torch.no_grad(): + x = self.codebert_model(input_ids=token_ids, attention_mask=mask).last_hidden_state + + if self.use_graph: + graph_emb = self.graph_emb(graph_ids) + x = torch.cat([x, graph_emb], dim=-1) + + x = torch.relu(self.fc1(x)) + x = self.drop(x) + x = self.fc2(x) + + return x + + def loss(self, logits, labels, mask, class_weights=None, extra_mask=None): + if extra_mask is not None: + mask = torch.logical_and(mask, extra_mask) + logits = logits[mask, :] + labels = labels[mask] + loss = self.loss_f(logits, labels) + # if class_weights is None: + # loss = tf.reduce_mean(tf.boolean_mask(losses, seq_mask)) + # else: + # loss = tf.reduce_mean(tf.boolean_mask(losses * class_weights, seq_mask)) + + return loss + + def score(self, logits, labels, mask, scorer=None, extra_mask=None): + if extra_mask is not 
None: + mask = torch.logical_and(mask, extra_mask) + true_labels = labels[mask] + argmax = logits.argmax(-1) + estimated_labels = argmax[mask] + + p, r, f1 = scorer(to_numpy(estimated_labels), to_numpy(true_labels)) + + return p, r, f1 + + +def get_length_mask(target, lens): + mask = torch.arange(target.size(1)).to(target.device)[None, :] < lens[:, None] + return mask + + +def batch_to_torch(batch, device): + key_types = { + 'tok_ids': torch.LongTensor, + 'tags': torch.LongTensor, + 'hide_mask': torch.BoolTensor, + 'no_loc_mask': torch.BoolTensor, + 'lens': torch.LongTensor, + 'graph_ids': torch.LongTensor + } + for key, tf in key_types.items(): + batch[key] = tf(batch[key]).to(device) + + +def to_numpy(tensor): + return tensor.cpu().detach().numpy() + + +def train_step_finetune(model, optimizer, token_ids, prefix, suffix, graph_ids, labels, lengths, + extra_mask=None, class_weights=None, scorer=None, finetune=False, vocab_mapping=None): + token_ids[token_ids == len(vocab_mapping)] = vocab_mapping[""] + seq_mask = get_length_mask(token_ids, lengths) + logits = model(token_ids, graph_ids, mask=seq_mask, finetune=finetune) + loss = model.loss(logits, labels, mask=seq_mask, class_weights=class_weights, extra_mask=extra_mask) + p, r, f1 = model.score(logits, labels, mask=seq_mask, scorer=scorer, extra_mask=extra_mask) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + return loss, p, r, f1 + + +def test_step( + model, token_ids, prefix, suffix, graph_ids, labels, lengths, extra_mask=None, class_weights=None, scorer=None, + vocab_mapping=None +): + with torch.no_grad(): + token_ids[token_ids == len(vocab_mapping)] = vocab_mapping[""] + seq_mask = get_length_mask(token_ids, lengths) + logits = model(token_ids, graph_ids, mask=seq_mask) + loss = model.loss(logits, labels, mask=seq_mask, class_weights=class_weights, extra_mask=extra_mask) + p, r, f1 = model.score(logits, labels, mask=seq_mask, scorer=scorer, extra_mask=extra_mask) + + return loss, p, r, f1 + 
+ +class CodeBertModelTrainer2(CodeBertModelTrainer): + def __init__(self, *args, gpu_id=-1, **kwargs): + self.gpu_id = gpu_id + super().__init__(*args, **kwargs) + self.set_gpu() + + def get_dataloaders(self, word_emb, graph_emb, suffix_prefix_buckets): + decoder_mapping = RobertaTokenizer.from_pretrained("microsoft/codebert-base").decoder + tok_ids, words = zip(*decoder_mapping.items()) + self.vocab_mapping = dict(zip(words, tok_ids)) + + train_batcher = self.get_batcher( + self.train_data, self.batch_size, seq_len=self.seq_len, + graphmap=graph_emb.ind if graph_emb is not None else None, + wordmap=self.vocab_mapping, tagmap=None, + class_weights=False, element_hash_size=suffix_prefix_buckets, no_localization=self.no_localization + ) + test_batcher = self.get_batcher( + self.test_data, self.batch_size, seq_len=self.seq_len, + graphmap=graph_emb.ind if graph_emb is not None else None, + wordmap=self.vocab_mapping, + tagmap=train_batcher.tagmap, # use the same mapping + class_weights=False, element_hash_size=suffix_prefix_buckets, # class_weights are not used for testing + no_localization=self.no_localization + ) + return train_batcher, test_batcher + + def train( + self, model, train_batches, test_batches, epochs, report_every=10, scorer=None, learning_rate=0.01, + learning_rate_decay=1., finetune=False, summary_writer=None, save_ckpt_fn=None, no_localization=False + ): + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) + scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learning_rate_decay) + + train_losses = [] + test_losses = [] + train_f1s = [] + test_f1s = [] + + num_train_batches = len(train_batches) + num_test_batches = len(test_batches) + + best_f1 = 0. 
+ + try: + for e in range(epochs): + losses = [] + ps = [] + rs = [] + f1s = [] + + start = time() + model.train() + + for ind, batch in enumerate(tqdm(train_batches)): + batch_to_torch(batch, self.device) + # token_ids, graph_ids, labels, class_weights, lengths = b + loss, p, r, f1 = train_step_finetune( + model=model, optimizer=optimizer, token_ids=batch['tok_ids'], + prefix=batch['prefix'], suffix=batch['suffix'], graph_ids=batch['graph_ids'], + labels=batch['tags'], lengths=batch['lens'], + extra_mask=batch['no_loc_mask'] if no_localization else batch['hide_mask'], + # class_weights=batch['class_weights'], + scorer=scorer, finetune=finetune and e / epochs > 0.6, + vocab_mapping=self.vocab_mapping + ) + losses.append(loss.cpu().item()) + ps.append(p) + rs.append(r) + f1s.append(f1) + + self.summary_writer.add_scalar("Loss/Train", loss, global_step=e * num_train_batches + ind) + self.summary_writer.add_scalar("Precision/Train", p, global_step=e * num_train_batches + ind) + self.summary_writer.add_scalar("Recall/Train", r, global_step=e * num_train_batches + ind) + self.summary_writer.add_scalar("F1/Train", f1, global_step=e * num_train_batches + ind) + + test_alosses = [] + test_aps = [] + test_ars = [] + test_af1s = [] + + model.eval() + + for ind, batch in enumerate(test_batches): + batch_to_torch(batch, self.device) + # token_ids, graph_ids, labels, class_weights, lengths = b + test_loss, test_p, test_r, test_f1 = test_step( + model=model, token_ids=batch['tok_ids'], + prefix=batch['prefix'], suffix=batch['suffix'], graph_ids=batch['graph_ids'], + labels=batch['tags'], lengths=batch['lens'], + extra_mask=batch['no_loc_mask'] if no_localization else batch['hide_mask'], + # class_weights=batch['class_weights'], + scorer=scorer, vocab_mapping=self.vocab_mapping + ) + + self.summary_writer.add_scalar("Loss/Test", test_loss, global_step=e * num_test_batches + ind) + self.summary_writer.add_scalar("Precision/Test", test_p, global_step=e * num_test_batches + ind) + 
self.summary_writer.add_scalar("Recall/Test", test_r, global_step=e * num_test_batches + ind) + self.summary_writer.add_scalar("F1/Test", test_f1, global_step=e * num_test_batches + ind) + test_alosses.append(test_loss.cpu().item()) + test_aps.append(test_p) + test_ars.append(test_r) + test_af1s.append(test_f1) + + epoch_time = time() - start + + train_losses.append(float(sum(losses) / len(losses))) + train_f1s.append(float(sum(f1s) / len(f1s))) + test_losses.append(float(sum(test_alosses) / len(test_alosses))) + test_f1s.append(float(sum(test_af1s) / len(test_af1s))) + + print( + f"Epoch: {e}, {epoch_time: .2f} s, Train Loss: {train_losses[-1]: .4f}, Train P: {sum(ps) / len(ps): .4f}, Train R: {sum(rs) / len(rs): .4f}, Train F1: {sum(f1s) / len(f1s): .4f}, " + f"Test loss: {test_losses[-1]: .4f}, Test P: {sum(test_aps) / len(test_aps): .4f}, Test R: {sum(test_ars) / len(test_ars): .4f}, Test F1: {test_f1s[-1]: .4f}") + + if save_ckpt_fn is not None and float(test_f1s[-1]) > best_f1: + save_ckpt_fn() + best_f1 = float(test_f1s[-1]) + + scheduler.step(epoch=e) + + except KeyboardInterrupt: + pass + + return train_losses, train_f1s, test_losses, test_f1s + + def create_summary_writer(self, path): + self.summary_writer = SummaryWriter(path) + + def set_gpu(self): + # torch.device("cuda" if torch.cuda.is_available() else "cpu") + if self.gpu_id == -1: + self.use_cuda = False + self.device = "cpu" + else: + torch.cuda.set_device(self.gpu_id) + self.use_cuda = True + self.device = f"cuda:{self.gpu_id}" + + def train_model(self): + + print(f"\n\n{self.model_params}") + lr = self.model_params.pop("learning_rate") + lr_decay = self.model_params.pop("learning_rate_decay") + suffix_prefix_buckets = self.model_params.pop("suffix_prefix_buckets") + + graph_emb = load_pkl_emb(self.graph_emb_path) if self.graph_emb_path is not None else None + + train_batcher, test_batcher = self.get_dataloaders(None, graph_emb, suffix_prefix_buckets=suffix_prefix_buckets) + + codebert_model = 
RobertaModel.from_pretrained("microsoft/codebert-base") + model = CodebertHybridModel( + codebert_model, graph_emb.e, padding_idx=train_batcher.graphpad, num_classes=train_batcher.num_classes(), + no_graph=self.no_graph + ) + if self.use_cuda: + model.cuda() + + trial_dir = os.path.join(self.output_dir, "codebert_" + str(datetime.now())).replace(":", "-").replace(" ", "_") + os.mkdir(trial_dir) + self.create_summary_writer(trial_dir) + + def save_ckpt_fn(): + checkpoint_path = os.path.join(trial_dir, "checkpoint") + torch.save(model, open(checkpoint_path, 'wb')) + + train_losses, train_f1, test_losses, test_f1 = self.train( + model=model, train_batches=train_batcher, test_batches=test_batcher, + epochs=self.epochs, learning_rate=lr, + scorer=lambda pred, true: scorer(pred, true, train_batcher.tagmap, no_localization=self.no_localization), + learning_rate_decay=lr_decay, finetune=self.finetune, save_ckpt_fn=save_ckpt_fn, + no_localization=self.no_localization + ) + + # checkpoint_path = os.path.join(trial_dir, "checkpoint") + # model.save_weights(checkpoint_path) + + metadata = { + "train_losses": train_losses, + "train_f1": train_f1, + "test_losses": test_losses, + "test_f1": test_f1, + "learning_rate": lr, + "learning_rate_decay": lr_decay, + "epochs": self.epochs, + "suffix_prefix_buckets": suffix_prefix_buckets, + "seq_len": self.seq_len, + "batch_size": self.batch_size, + "no_localization": self.no_localization + } + + print("Maximum f1:", max(test_f1)) + + # write_config(trial_dir, params, extra_params={"suffix_prefix_buckets": suffix_prefix_buckets, "seq_len": seq_len}) + + metadata.update(self.model_params) + + with open(os.path.join(trial_dir, "params.json"), "w") as metadata_sink: + metadata_sink.write(json.dumps(metadata, indent=4)) + + pickle.dump(train_batcher.tagmap, open(os.path.join(trial_dir, "tag_types.pkl"), "wb")) + + # for ind, batch in enumerate(tqdm(batcher)): + # # token_ids, graph_ids, labels, class_weights, lengths = b + # token_ids = 
torch.LongTensor(batch["tok_ids"]) + # lens = torch.LongTensor(batch["lens"]) + # + # token_ids[token_ids == len(vocab_mapping)] = vocab_mapping[""] + # + # def get_length_mask(target, lens): + # mask = torch.arange(target.size(1)).to(target.device)[None, :] < lens[:, None] + # return mask + # + # mask = get_length_mask(token_ids, lens) + # with torch.no_grad(): + # embs = model(input_ids=token_ids, attention_mask=mask) + # + # for s_emb, s_repl in zip(embs.last_hidden_state, batch["replacements"]): + # unique_repls = set(list(s_repl)) + # repls_for_ann = [r for r in unique_repls if r in typed_nodes] + # + # for r in repls_for_ann: + # position = s_repl.index(r) + # if position > 512: + # continue + # node_ids.append(r) + # embeddings.append(s_emb[position]) + # + # all_embs = torch.stack(embeddings, dim=0).numpy() + # embedder = Embedder(dict(zip(node_ids, range(len(node_ids)))), all_embs) + # pickle.dump(embedder, open("codebert_embeddings.pkl", "wb"), fix_imports=False) + # print(node_ids) + + +def main(): + args = get_type_prediction_arguments() + + # allowed = {'str', 'bool', 'Optional', 'None', 'int', 'Any', 'Union', 'List', 'Dict', 'Callable', 'ndarray', + # 'FrameOrSeries', 'bytes', 'DataFrame', 'Matcher', 'float', 'Tuple', 'bool_t', 'Description', 'Type'} + if args.restrict_allowed: + allowed = { + 'str', 'Optional', 'int', 'Any', 'Union', 'bool', 'Callable', 'Dict', 'bytes', 'float', 'Description', + 'List', 'Sequence', 'Namespace', 'T', 'Type', 'object', 'HTTPServerRequest', 'Future', "Matcher" + } + else: + allowed = None + + # train_data, test_data = read_data( + # open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, + # include_only="entities", + # min_entity_count=args.min_entity_count, random_seed=args.random_seed + # ) + + from pathlib import Path + dataset_dir = Path(args.data_path).parent + train_data = filter_labels( + 
pickle.load(open(dataset_dir.joinpath("type_prediction_dataset_no_defaults_train.pkl"), "rb")), + allowed=allowed + ) + test_data = filter_labels( + pickle.load(open(dataset_dir.joinpath("type_prediction_dataset_no_defaults_test.pkl"), "rb")), + allowed=allowed + ) + + trainer = CodeBertModelTrainer2( + train_data, test_data, params={"learning_rate": 1e-4, "learning_rate_decay": 0.99, "suffix_prefix_buckets": 1}, + graph_emb_path=args.graph_emb_path, word_emb_path=args.word_emb_path, + output_dir=args.model_output, epochs=args.epochs, batch_size=args.batch_size, gpu_id=args.gpu, + finetune=args.finetune, trials=args.trials, seq_len=args.max_seq_len, no_localization=args.no_localization, + no_graph=args.no_graph + ) + trainer.set_type_ann_edges(args.type_ann_edges) + trainer.train_model() + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/nlp/codebert/deprecated/codebert_training.py b/SourceCodeTools/nlp/codebert/deprecated/codebert_training.py new file mode 100644 index 00000000..5f1ebd11 --- /dev/null +++ b/SourceCodeTools/nlp/codebert/deprecated/codebert_training.py @@ -0,0 +1,101 @@ +import pickle + +import torch +from tqdm import tqdm +from transformers import RobertaTokenizer, RobertaModel + +from SourceCodeTools.models.Embedder import Embedder +from SourceCodeTools.nlp.codebert.codebert import CodeBertModelTrainer, load_typed_nodes +from SourceCodeTools.nlp.entity.type_prediction import get_type_prediction_arguments, ModelTrainer +from SourceCodeTools.nlp.entity.utils.data import read_data + + +class CodeBertModelTrainer2(CodeBertModelTrainer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_type_ann_edges(self, path): + self.type_ann_edges = path + + def get_batcher(self, *args, **kwargs): + kwargs.update({"tokenizer": "codebert"}) + return self.batcher(*args, **kwargs) + + def train_model(self): + # graph_emb = load_pkl_emb(self.graph_emb_path) if self.graph_emb_path is 
not None else None + + typed_nodes = load_typed_nodes(self.type_ann_edges) + + decoder_mapping = RobertaTokenizer.from_pretrained("microsoft/codebert-base").decoder + tok_ids, words = zip(*decoder_mapping.items()) + vocab_mapping = dict(zip(words, tok_ids)) + batcher = self.get_batcher( + self.train_data + self.test_data, self.batch_size, seq_len=self.seq_len, + graphmap=None, + wordmap=vocab_mapping, tagmap=None, + class_weights=False, element_hash_size=1 + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = RobertaModel.from_pretrained("microsoft/codebert-base") + model.to(device) + + node_ids = [] + embeddings = [] + + for ind, batch in enumerate(tqdm(batcher)): + # token_ids, graph_ids, labels, class_weights, lengths = b + token_ids = torch.LongTensor(batch["tok_ids"]) + lens = torch.LongTensor(batch["lens"]) + + token_ids[token_ids == len(vocab_mapping)] = vocab_mapping[""] + + def get_length_mask(target, lens): + mask = torch.arange(target.size(1)).to(target.device)[None, :] < lens[:, None] + return mask + + mask = get_length_mask(token_ids, lens) + with torch.no_grad(): + embs = model(input_ids=token_ids, attention_mask=mask) + + for s_emb, s_repl in zip(embs.last_hidden_state, batch["replacements"]): + unique_repls = set(list(s_repl)) + repls_for_ann = [r for r in unique_repls if r in typed_nodes] + + for r in repls_for_ann: + position = s_repl.index(r) + if position > 512: + continue + node_ids.append(r) + embeddings.append(s_emb[position]) + + all_embs = torch.stack(embeddings, dim=0).numpy() + embedder = Embedder(dict(zip(node_ids, range(len(node_ids)))), all_embs) + pickle.dump(embedder, open("codebert_embeddings.pkl", "wb"), fix_imports=False) + print(node_ids) + + +def main(): + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base") + model = RobertaModel.from_pretrained("microsoft/codebert-base") + # model.to(device) + args = 
get_type_prediction_arguments() + + # allowed = {'str', 'bool', 'Optional', 'None', 'int', 'Any', 'Union', 'List', 'Dict', 'Callable', 'ndarray', + # 'FrameOrSeries', 'bytes', 'DataFrame', 'Matcher', 'float', 'Tuple', 'bool_t', 'Description', 'Type'} + + train_data, test_data = read_data( + open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, + include_only="entities", + min_entity_count=args.min_entity_count, random_seed=args.random_seed + ) + + trainer = CodeBertModelTrainer(train_data, test_data, params={}, seq_len=512) + trainer.set_type_ann_edges(args.type_ann_edges) + trainer.train_model() + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SourceCodeTools/nlp/codebert/fasttext_extract.py b/SourceCodeTools/nlp/codebert/fasttext_extract.py new file mode 100644 index 00000000..7d635664 --- /dev/null +++ b/SourceCodeTools/nlp/codebert/fasttext_extract.py @@ -0,0 +1,98 @@ +import pickle + +import gensim +import numpy as np +import torch +from tqdm import tqdm + +from SourceCodeTools.models.Embedder import Embedder +from SourceCodeTools.nlp.entity.type_prediction import get_type_prediction_arguments, ModelTrainer, load_pkl_emb +from SourceCodeTools.nlp.entity.utils.data import read_data + + +def load_typed_nodes(path): + from SourceCodeTools.code.data.file_utils import unpersist + type_ann = unpersist(path) + + filter_rule = lambda name: "0x" not in name + + type_ann = type_ann[ + type_ann["dst"].apply(filter_rule) + ] + + typed_nodes = set(type_ann["src"].tolist()) + return typed_nodes + + +class FasttextModelTrainer(ModelTrainer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_type_ann_edges(self, path): + self.type_ann_edges = path + + def get_batcher(self, *args, **kwargs): + # kwargs.update({"tokenizer": "codebert"}) + return self.batcher(*args, **kwargs) + + def train_model(self): + # graph_emb = load_pkl_emb(self.graph_emb_path) if 
self.graph_emb_path is not None else None + word_emb = load_pkl_emb(self.word_emb_path) + + fasttext = gensim.models.FastText.load("/Users/LTV/Downloads/NitroShare/py_150_d500/model") + + typed_nodes = load_typed_nodes(self.type_ann_edges) + vocab_mapping = word_emb.ind + + batcher = self.get_batcher( + self.train_data + self.test_data, self.batch_size, seq_len=self.seq_len, + graphmap=None, + wordmap=vocab_mapping, tagmap=None, + class_weights=False, element_hash_size=1 + ) + + node_ids = [] + embeddings = [] + + added = set() + + for ind, batch in enumerate(tqdm(batcher)): + for s_toks, s_repl in zip(batch["toks"], batch["replacements"]): + unique_repls = set(list(s_repl)) + repls_for_ann = [r for r in unique_repls if r in typed_nodes] + + for r in repls_for_ann: + position = s_repl.index(r) + if position > 512: + continue + if r not in added: + node_ids.append(r) + embeddings.append(fasttext.wv[s_toks[position]]) + added.add(r) + + all_embs = np.vstack(embeddings) + embedder = Embedder(dict(zip(node_ids, range(len(node_ids)))), all_embs) + pickle.dump(embedder, open("/Users/LTV/Downloads/NitroShare/fasttext_embeddings_500_2.pkl", "wb"), fix_imports=False) + print(node_ids) + + +def main(): + args = get_type_prediction_arguments() + + # allowed = {'str', 'bool', 'Optional', 'None', 'int', 'Any', 'Union', 'List', 'Dict', 'Callable', 'ndarray', + # 'FrameOrSeries', 'bytes', 'DataFrame', 'Matcher', 'float', 'Tuple', 'bool_t', 'Description', 'Type'} + + train_data, test_data = read_data( + open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, + include_only="entities", + min_entity_count=args.min_entity_count, random_seed=args.random_seed + ) + + trainer = FasttextModelTrainer(train_data, test_data, params={}, seq_len=512, word_emb_path=args.word_emb_path) + trainer.set_type_ann_edges(args.type_ann_edges) + trainer.train_model() + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git 
a/SourceCodeTools/nlp/embed/lda.py b/SourceCodeTools/nlp/embed/lda.py index fddbeafa..76f229ae 100644 --- a/SourceCodeTools/nlp/embed/lda.py +++ b/SourceCodeTools/nlp/embed/lda.py @@ -11,7 +11,7 @@ def read_corpus(path, data_field): if path.endswith("bz2") or path.endswith("parquet") or path.endswith("csv"): - from SourceCodeTools.code.data.sourcetrail.file_utils import unpersist + from SourceCodeTools.code.data.file_utils import unpersist data = unpersist(path)[data_field].tolist() elif path.endswith("jsonl"): import json diff --git a/SourceCodeTools/nlp/entity/apply_model.py b/SourceCodeTools/nlp/entity/apply_model.py index d6d75e68..9ddd64bd 100644 --- a/SourceCodeTools/nlp/entity/apply_model.py +++ b/SourceCodeTools/nlp/entity/apply_model.py @@ -4,11 +4,17 @@ import os import pickle +import numpy as np +from tqdm import tqdm + +os.environ['CUDA_VISIBLE_DEVICES'] = '-1' +os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' + import tensorflow as tf from SourceCodeTools.nlp.entity import parse_biluo from SourceCodeTools.nlp.entity.tf_models.tf_model import TypePredictor -from SourceCodeTools.nlp.entity.type_prediction import PythonBatcher, load_pkl_emb +from SourceCodeTools.nlp.entity.type_prediction import PythonBatcher, load_pkl_emb, span_f1 from SourceCodeTools.nlp.entity.utils.data import read_data @@ -42,14 +48,14 @@ def predict_one(model, input, tagmap): return logits_to_annotations( tf.math.argmax(model( token_ids=input['tok_ids'], prefix_ids=input['prefix'], suffix_ids=input['suffix'], - graph_ids=input['graph_ids'], training=False + graph_ids=input['graph_ids'], training=False, mask=tf.sequence_mask(input["lens"], input["tok_ids"].shape[1]) ), axis=-1), input['lens'], tagmap ) -def apply_to_dataset(data, Batcher, Model, graph_emb_path=None, word_emb_path=None, checkpoint_path=None, batch_size=1): +def apply_to_dataset(data, Batcher, Model, graph_emb_path=None, word_emb_path=None, checkpoint_path=None): graph_emb = load_pkl_emb(graph_emb_path) word_emb = 
load_pkl_emb(word_emb_path) @@ -61,10 +67,11 @@ def apply_to_dataset(data, Batcher, Model, graph_emb_path=None, word_emb_path=No seq_len = params.pop("seq_len") suffix_prefix_buckets = params.pop("suffix_prefix_buckets") + batch_size = params.pop("batch_size") data_batcher = Batcher( data, batch_size, seq_len=seq_len, wordmap=word_emb.ind, graphmap=graph_emb.ind, tagmap=tagmap, - mask_unlabeled_declarations=True, class_weights=False, element_hash_size=suffix_prefix_buckets + mask_unlabeled_declarations=True, class_weights=False, element_hash_size=suffix_prefix_buckets, len_sort=True ) params.pop("train_losses") @@ -76,26 +83,96 @@ def apply_to_dataset(data, Batcher, Model, graph_emb_path=None, word_emb_path=No params.pop("learning_rate_decay") model = Model( - word_emb, graph_emb, train_embeddings=False, num_classes=len(tagmap), seq_len=seq_len, **params + word_emb, graph_emb, train_embeddings=False, num_classes=len(tagmap), seq_len=seq_len, + suffix_prefix_buckets=suffix_prefix_buckets, **params ) model.load_weights(os.path.join(checkpoint_path, "checkpoint")) all_true = [] all_estimated = [] + true_scoring = [] + pred_scoring = [] - for batch in data_batcher: + for batch_ind, batch in enumerate(data_batcher): true_annotations = logits_to_annotations(batch['tags'], batch['lens'], tagmap) est_annotations = predict_one(model, batch, tagmap) all_true.extend(true_annotations) all_estimated.extend(est_annotations) + for s_ind in range(len(batch["lens"])): + for ent in true_annotations[s_ind]: + true_scoring.append((batch_ind, s_ind, ent)) + for ent in est_annotations[s_ind]: + pred_scoring.append((batch_ind, s_ind, ent)) + + precision, recall, f1 = span_f1(set(pred_scoring), set(true_scoring)) + + with open(os.path.join(args.checkpoint_path, "scores.txt"), "w") as scores_sink: + scores_str = f"Precision: {precision: .2f}, Recall: {recall: .2f}, f1: {f1: .2f}" + print(scores_str) + scores_sink.write(f"{scores_str}\n") + from SourceCodeTools.nlp.entity.entity_render 
import render_annotations - html = render_annotations(zip(data, all_estimated, all_true)) - with open("render.html", "w") as render: + html = render_annotations(zip(data_batcher.data, all_estimated, all_true)) + with open(os.path.join(args.checkpoint_path, "render.html"), "w") as render: render.write(html) + estimate_confusion(all_estimated, all_true) + + +def estimate_confusion(pred, true): + pred_filtered = [] + true_filtered = [] + for p, t in zip(pred, true): + for e in p: + e_span = e[:2] + for e_t in t: + t_span = e_t[:2] + if e_span == t_span: + pred_filtered.append(e[2]) + true_filtered.append(e_t[2]) + break + + labels = sorted(list(set(true_filtered + pred_filtered))) + label2ind = dict(zip(labels, range(len(labels)))) + + confusion = np.zeros((len(labels), len(labels))) + + for pred, true in zip(pred_filtered, true_filtered): + confusion[label2ind[true], label2ind[pred]] += 1 + + norm = np.array([x if x != 0 else 1. for x in np.sum(confusion, axis=1)]).reshape(-1,1) + confusion /= norm + + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(45,45)) + im = ax.imshow(confusion) + + # We want to show all ticks... + ax.set_xticks(np.arange(len(labels))) + ax.set_yticks(np.arange(len(labels))) + # ... and label them with the respective list entries + ax.set_xticklabels(labels) + ax.set_yticklabels(labels) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + + # Loop over data dimensions and create text annotations. 
+ for i in range(len(labels)): + for j in range(len(labels)): + text = ax.text(j, i, f"{confusion[i, j]: .2f}", + ha="center", va="center", color="w") + + ax.set_title("Confusion matrix for Python type prediction") + fig.tight_layout() + # plt.show() + plt.savefig(os.path.join(args.checkpoint_path, "confusion.png")) + if __name__ == "__main__": import argparse @@ -108,16 +185,18 @@ def apply_to_dataset(data, Batcher, Model, graph_emb_path=None, word_emb_path=No parser.add_argument('--word_emb_path', dest='word_emb_path', default=None, help='Path to the file with edges') parser.add_argument('checkpoint_path', default=None, help='') + parser.add_argument('--random_seed', dest='random_seed', default=None, type=int, + help='') args = parser.parse_args() train_data, test_data = read_data( open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, - include_only="entities", - min_entity_count=5 + include_only="entities", random_seed=args.random_seed, + min_entity_count=3 ) apply_to_dataset( test_data, PythonBatcher, TypePredictor, graph_emb_path=args.graph_emb_path, word_emb_path=args.word_emb_path, - checkpoint_path=args.checkpoint_path, batch_size=args.batch_size + checkpoint_path=args.checkpoint_path ) diff --git a/SourceCodeTools/nlp/entity/entity_render.py b/SourceCodeTools/nlp/entity/entity_render.py index 9b708142..8dcf73ed 100644 --- a/SourceCodeTools/nlp/entity/entity_render.py +++ b/SourceCodeTools/nlp/entity/entity_render.py @@ -1,5 +1,5 @@ import spacy -from SourceCodeTools.nlp.entity.util import inject_tokenizer +from SourceCodeTools.nlp import create_tokenizer html_template = """ @@ -57,7 +57,7 @@ def annotate(doc, entities): def render_annotations(annotations): - nlp = inject_tokenizer(spacy.blank("en")) + nlp = create_tokenizer("spacy") entries = "" for annotation in annotations: text, predicted, annotated = annotation diff --git a/SourceCodeTools/nlp/entity/map_args_to_mentions.py 
b/SourceCodeTools/nlp/entity/map_args_to_mentions.py new file mode 100644 index 00000000..087c1970 --- /dev/null +++ b/SourceCodeTools/nlp/entity/map_args_to_mentions.py @@ -0,0 +1,48 @@ +import argparse +import json +from os.path import join + +from SourceCodeTools.code.data.dataset.Dataset import load_data +from SourceCodeTools.code.data.file_utils import unpersist + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("working_directory") + parser.add_argument("output") + args = parser.parse_args() + + nodes, edges = load_data(join(args.working_directory, "nodes.bz2"), join(args.working_directory, "edges.bz2")) + type_annotated = set(unpersist(join(args.working_directory, "type_annotations.bz2"))["src"].tolist()) + arguments = set(nodes.query("type == 'arg'")["id"].tolist()) + mentions = set(nodes.query("type == 'mention'")["id"].tolist()) + + edges["in_mentions"] = edges["src"].apply(lambda src: src in mentions) + + edges["in_args"] = edges["dst"].apply(lambda dst: dst in arguments) + + edges = edges.query("in_mentions == True and in_args == True") + + mapping = {} + for src, dst in edges[["src", "dst"]].values: + if dst in mapping: + print() + mapping[dst] = src + + with open(args.output, "w") as sink: + with open(join(args.working_directory, "function_annotations.jsonl")) as fa: + for line in fa: + entry = json.loads(line) + new_repl = [[s, e, int(mapping.get(r, r))] for s, e, r in entry["replacements"]] + entry["replacements"] = new_repl + + sink.write(f"{json.dumps(entry)}\n") + + + print() + + # pickle.dump(mapping, open(args.output, "wb")) + + +if __name__ == "__main__": + main() diff --git a/SourceCodeTools/nlp/entity/split_dataset.py b/SourceCodeTools/nlp/entity/split_dataset.py new file mode 100644 index 00000000..2aa75596 --- /dev/null +++ b/SourceCodeTools/nlp/entity/split_dataset.py @@ -0,0 +1,34 @@ +import pickle +from collections import Counter + +from SourceCodeTools.nlp.entity.type_prediction import 
get_type_prediction_arguments +from SourceCodeTools.nlp.entity.utils.data import read_data + + +def get_all_annotations(dataset): + ann = [] + for _, annotations in dataset: + for _, _, e in annotations["entities"]: + ann.append(e) + return ann + + +if __name__ == "__main__": + + args = get_type_prediction_arguments() + + train_data, test_data = read_data( + open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, include_only="entities", + min_entity_count=args.min_entity_count, random_seed=args.random_seed + ) + + pickle.dump(train_data, open("type_prediction_dataset_no_defaults_train.pkl", "wb")) + pickle.dump(test_data, open("type_prediction_dataset_no_defaults_test.pkl", "wb")) + + ent_counts = Counter(get_all_annotations(train_data)) | Counter(get_all_annotations(test_data)) + + with open("type_prediction_dataset_argument_annotations_counts.txt", "w") as sink: + for ent, count in ent_counts.most_common(): + sink.write(f"{ent}\t{count}\n") + + print() \ No newline at end of file diff --git a/SourceCodeTools/nlp/entity/tf_models/params.py b/SourceCodeTools/nlp/entity/tf_models/params.py index f8560eeb..341e578f 100644 --- a/SourceCodeTools/nlp/entity/tf_models/params.py +++ b/SourceCodeTools/nlp/entity/tf_models/params.py @@ -20,22 +20,30 @@ # "suffix_prefix_dims": 20, # "suffix_prefix_buckets": 1000, # }, - { - "h_sizes": [40, 40, 40], - "dense_size": 30, - "pos_emb_size": 30, - "cnn_win_size": 5, - "suffix_prefix_dims": 50, - "suffix_prefix_buckets": 2000, - }, - # { + # { + # "h_sizes": [40, 40, 40], + # "dense_size": 30, + # "pos_emb_size": 30, + # "cnn_win_size": 5, + # "suffix_prefix_dims": 50, + # "suffix_prefix_buckets": 2000, + # }, + # { # "h_sizes": [80, 80, 80], # "dense_size": 40, # "pos_emb_size": 50, # "cnn_win_size": 7, # "suffix_prefix_dims": 70, # "suffix_prefix_buckets": 3000, - # } + # }, + { + "h_sizes": [100, 100, 100], + "dense_size": 60, + "pos_emb_size": 50, + "cnn_win_size": 7, + 
"suffix_prefix_dims": 70, + "suffix_prefix_buckets": 3000, + } ], "learning_rate": [0.0001], "learning_rate_decay": [0.998] # 0.991 @@ -115,4 +123,4 @@ def flatten(params): att_params = list( map(flatten, ParameterGrid(att_params_grids)) -) \ No newline at end of file +) diff --git a/SourceCodeTools/nlp/entity/tf_models/tf_model.py b/SourceCodeTools/nlp/entity/tf_models/tf_model.py index 4a1ba2b9..bba7bad2 100644 --- a/SourceCodeTools/nlp/entity/tf_models/tf_model.py +++ b/SourceCodeTools/nlp/entity/tf_models/tf_model.py @@ -1,17 +1,21 @@ # import tensorflow as tf # import sys # from gensim.models import Word2Vec +import logging +from time import time + import numpy as np # from collections import Counter from scipy.linalg import toeplitz # from gensim.models import KeyedVectors -from copy import copy +from copy import copy, deepcopy import tensorflow as tf import tensorflow_addons as tfa from tensorflow.keras.layers import Dense, Conv2D, Flatten, Input, Embedding, concatenate from tensorflow.keras import Model +from tensorflow.keras.layers import Layer # from tensorflow.keras import regularizers # from spacy.gold import offsets_from_biluo_tags @@ -21,194 +25,45 @@ # https://github.com/dhiraa/tener/tree/master/src/tener/models # https://arxiv.org/pdf/1903.07785v1.pdf # https://github.com/tensorflow/models/tree/master/research/cvt_text/model - - -class DefaultEmbedding(Model): - """ - Creates an embedder that provides the default value for the index -1. The default value is a zero-vector - """ - def __init__(self, init_vectors=None, shape=None, trainable=True): - super(DefaultEmbedding, self).__init__() - - if init_vectors is not None: - self.embs = tf.Variable(init_vectors, dtype=tf.float32, - trainable=trainable, name="default_embedder_var") - shape = init_vectors.shape - else: - # TODO - # the default value is no longer constant. 
need to replace this with a standard embedder - self.embs = tf.Variable(tf.random.uniform(shape=(shape[0], shape[1]), dtype=tf.float32), - name="default_embedder_pad") - # self.pad = tf.zeros(shape=(1, init_vectors.shape[1]), name="default_embedder_pad") - # self.pad = tf.random.uniform(shape=(1, init_vectors.shape[1]), name="default_embedder_pad") - self.pad = tf.Variable(tf.random.uniform(shape=(1, shape[1]), dtype=tf.float32), - name="default_embedder_pad") - - - def __call__(self, ids): - emb_matr = tf.concat([self.embs, self.pad], axis=0) - return tf.nn.embedding_lookup(params=emb_matr, ids=ids) - # return tf.expand_dims(tf.nn.embedding_lookup(params=self.emb_matr, ids=ids), axis=3) - - -class PositionalEncoding(Model): - def __init__(self, seq_len, pos_emb_size): - """ - Create positional embedding matrix for tokens. Currently not using because it results in N^2 computational - complexity. Should move this functionality to batch preparation. - :param seq_len: maximum sequence length - :param pos_emb_size: the dimensionality of positional embeddings - """ - super(PositionalEncoding, self).__init__() - - positions = list(range(seq_len * 2)) - position_splt = positions[:seq_len] - position_splt.reverse() - self.position_encoding = tf.constant(toeplitz(position_splt, positions[seq_len:]), - dtype=tf.int32, - name="position_encoding") - # self.position_embedding = tf.random.uniform(name="position_embedding", shape=(seq_len * 2, pos_emb_size), dtype=tf.float32) - self.position_embedding = tf.Variable(tf.random.uniform(shape=(seq_len * 2, pos_emb_size), dtype=tf.float32), - name="position_embedding") - # self.position_embedding = tf.Variable(name="position_embedding", shape=(seq_len * 2, pos_emb_size), dtype=tf.float32) - - def __call__(self): - # return tf.nn.embedding_lookup(self.position_embedding, self.position_encoding, name="position_lookup") - return tf.nn.embedding_lookup(self.position_embedding, self.position_encoding, name="position_lookup") - - -class 
TextCnnLayer(Model): - """ - - """ - def __init__(self, out_dim, kernel_shape, activation=None): - super(TextCnnLayer, self).__init__() - - self.kernel_shape = kernel_shape - self.out_dim = out_dim - - self.textConv = Conv2D(filters=out_dim, kernel_size=kernel_shape, - activation=activation, data_format='channels_last') - - padding_size = (self.kernel_shape[0] - 1) // 2 - assert padding_size * 2 + 1 == self.kernel_shape[0] - self.pad_constant = tf.constant([[0, 0], [padding_size, padding_size], [0, 0], [0, 0]]) - - def __call__(self, x): - padded = tf.pad(x, self.pad_constant) - # emb_sent_exp = tf.expand_dims(input, axis=3) - convolve = self.textConv(padded) - return tf.squeeze(convolve, axis=-2) - - -class TextCnn(Model): - """ - TextCnn model for classifying tokens in a sequence. The model uses following pipeline: - - token_embeddings (provided from outside) -> - several convolutional layers, get representations for all tokens -> - pass representation for all tokens through a dense network -> - classify each token - """ - def __init__(self, input_size, h_sizes, seq_len, - pos_emb_size, cnn_win_size, dense_size, num_classes, - activation=None, dense_activation=None, drop_rate=0.2): - """ - - :param input_size: dimensionality of input embeddings - :param h_sizes: sizes of hidden CNN layers, internal dimensionality of token embeddings - :param seq_len: maximum sequence length - :param pos_emb_size: dimensionality of positional embeddings - :param cnn_win_size: width of cnn window - :param dense_size: number of unius in dense network - :param num_classes: number of output units - :param activation: activation for cnn - :param dense_activation: activation for dense layers - :param drop_rate: dropout rate for dense network - """ - super(TextCnn, self).__init__() - - self.seq_len = seq_len - self.h_sizes = h_sizes - self.dense_size = dense_size - self.num_classes = num_classes - - def infer_kernel_sizes(h_sizes): - """ - Compute kernel sizes from the desired 
dimensionality of hidden layers - :param h_sizes: - :return: - """ - kernel_sizes = copy(h_sizes) - kernel_sizes.pop(-1) # pop last because it is the output of the last CNN layer - kernel_sizes.insert(0, input_size) # the first kernel size should be (cnn_win_size, input_size) - kernel_sizes = [(cnn_win_size, ks) for ks in kernel_sizes] - return kernel_sizes - - kernel_sizes = infer_kernel_sizes(h_sizes) - - self.layers_tok = [ TextCnnLayer(out_dim=h_size, kernel_shape=kernel_size, activation=activation) - for h_size, kernel_size in zip(h_sizes, kernel_sizes)] - - # self.layers_pos = [TextCnnLayer(out_dim=h_size, kernel_shape=(cnn_win_size, pos_emb_size), activation=activation) - # for h_size, _ in zip(h_sizes, kernel_sizes)] - - # self.positional = PositionalEncoding(seq_len=seq_len, pos_emb_size=pos_emb_size) - - if dense_activation is None: - dense_activation = activation - - # self.attention = tfa.layers.MultiHeadAttention(head_size=200, num_heads=1) - - self.dense_1 = Dense(dense_size, activation=dense_activation) - self.dropout_1 = tf.keras.layers.Dropout(rate=drop_rate) - self.dense_2 = Dense(num_classes, activation=None) # logits - self.dropout_2 = tf.keras.layers.Dropout(rate=drop_rate) - - def __call__(self, embs, training=True): - - temp_cnn_emb = embs # shape (?, seq_len, input_size) - - # pass embeddings through several CNN layers - for l in self.layers_tok: - temp_cnn_emb = l(tf.expand_dims(temp_cnn_emb, axis=3)) # shape (?, seq_len, h_size) - - # TODO - # simplify to one CNN and one attention - - # pos_cnn = self.positional() - # for l in self.layers_pos: - # pos_cnn = l(tf.expand_dims(pos_cnn, axis=3)) - # - # cnn_pool_feat = [] - # for i in range(self.seq_len): - # # slice tensor for the line that corresponds to the current position in the sentence - # position_features = tf.expand_dims(pos_cnn[i, ...], axis=0, name="exp_dim_%d" % i) - # # convolution without activation can be combined later, hence: temp_cnn_emb + position_features - # 
cnn_pool_feat.append( - # tf.expand_dims(tf.nn.tanh(tf.reduce_max(temp_cnn_emb + position_features, axis=1)), axis=1)) - # # cnn_pool_feat.append( - # # tf.expand_dims(tf.nn.tanh(tf.reduce_max(tf.concat([temp_cnn_emb, position_features], axis=-1), axis=1)), axis=1)) - # - # cnn_pool_features = tf.concat(cnn_pool_feat, axis=1) - cnn_pool_features = temp_cnn_emb - - # cnn_pool_features = self.attention([cnn_pool_features, cnn_pool_features]) - - # token_features = self.dropout_1( - # tf.reshape(cnn_pool_features, shape=(-1, self.h_sizes[-1])) - # , training=training) - - # reshape before passing through a dense network - token_features = tf.reshape(cnn_pool_features, shape=(-1, self.h_sizes[-1])) # shape (? * seq_len, h_size[-1]) - - # local_h2 = self.dropout_2( - # self.dense_1(token_features) - # , training=training) - local_h2 = self.dense_1(token_features) # shape (? * seq_len, dense_size) - tag_logits = self.dense_2(local_h2) # shape (? * seq_len, num_classes) - - return tf.reshape(tag_logits, (-1, self.seq_len, self.num_classes)) # reshape back, shape (?, seq_len, num_classes) +from tensorflow_addons.layers import MultiHeadAttention +from tqdm import tqdm + +from SourceCodeTools.models.nlp.TFDecoder import ConditionalAttentionDecoder, FlatDecoder +from SourceCodeTools.models.nlp.TFEncoder import DefaultEmbedding, TextCnnEncoder, GRUEncoder + + +class T5Encoder(Model): + def __init__(self): + super(T5Encoder, self).__init__() + from transformers.models.t5.modeling_tf_t5 import T5Config, TFT5MainLayer + self.adapter = Dense(40, activation=tf.nn.relu) + self.t5_config = T5Config(d_model=40, d_ff=40, num_layers=1, num_decoder_layers=1, num_heads=1, is_encoder_decoder=False,d_kv=40) + self.encoder = TFT5MainLayer(self.t5_config,) + + def call(self, embs, training=True, mask=None): + from transformers.modeling_tf_utils import input_processing + inputs = input_processing( + func=self.call, + config=self.t5_config, + input_ids=None, + attention_mask=None, + 
decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=self.adapter(embs), + decoder_inputs_embeds=None, + use_cache=False, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=training, + kwargs_call={}, + ) + encoded = self.encoder(inputs["inputs"], inputs_embeds=inputs["inputs_embeds"], training=training) + return encoded["last_hidden_state"] class TypePredictor(Model): @@ -222,9 +77,10 @@ class TypePredictor(Model): def __init__(self, tok_embedder, graph_embedder, train_embeddings=False, h_sizes=None, dense_size=100, num_classes=None, seq_len=100, pos_emb_size=30, cnn_win_size=3, - crf_transitions=None, suffix_prefix_dims=50, suffix_prefix_buckets=1000): + crf_transitions=None, suffix_prefix_dims=50, suffix_prefix_buckets=1000, + no_graph=False): """ - Initialize TypePredictor. Model initializes embedding layers and then passes embeddings to TextCnn model + Initialize TypePredictor. 
Model initializes embedding layers and then passes embeddings to TextCnnEncoder model :param tok_embedder: Embedder for tokens :param graph_embedder: Embedder for graph nodes :param train_embeddings: whether to finetune embeddings @@ -251,35 +107,43 @@ def __init__(self, tok_embedder, graph_embedder, train_embeddings=False, # initialize embeddings with tf.device('/CPU:0'): self.tok_emb = DefaultEmbedding(init_vectors=tok_embedder.e, trainable=train_embeddings) - self.graph_emb = DefaultEmbedding(init_vectors=graph_embedder.e, trainable=train_embeddings) + logging.warning("Not finetuning graph embeddings") + self.graph_emb = DefaultEmbedding(init_vectors=graph_embedder.e, trainable=False) # train_embeddings) self.prefix_emb = DefaultEmbedding(shape=(suffix_prefix_buckets, suffix_prefix_dims)) self.suffix_emb = DefaultEmbedding(shape=(suffix_prefix_buckets, suffix_prefix_dims)) - # self.tok_emb = Embedding(input_dim=tok_embedder.e.shape[0], - # output_dim=tok_embedder.e.shape[1], - # weights=tok_embedder.e, trainable=train_embeddings, - # mask_zero=True) - # - # self.graph_emb = Embedding(input_dim=graph_embedder.e.shape[0], - # output_dim=graph_embedder.e.shape[1], - # weights=graph_embedder.e, trainable=train_embeddings, - # mask_zero=True) - # compute final embedding size after concatenation - input_dim = tok_embedder.e.shape[1] + suffix_prefix_dims * 2 + graph_embedder.e.shape[1] - - self.text_cnn = TextCnn(input_size=input_dim, h_sizes=h_sizes, - seq_len=seq_len, pos_emb_size=pos_emb_size, - cnn_win_size=cnn_win_size, dense_size=dense_size, - num_classes=num_classes, activation=tf.nn.relu, - dense_activation=tf.nn.tanh) + input_dim = tok_embedder.e.shape[1] + suffix_prefix_dims * 2 #+ graph_embedder.e.shape[1] + + if cnn_win_size % 2 == 0: + cnn_win_size += 1 + logging.info(f"Window size should be odd. 
Setting to {cnn_win_size}") + + self.encoder = TextCnnEncoder( + # input_size=input_dim, + h_sizes=h_sizes, + seq_len=seq_len, pos_emb_size=pos_emb_size, + cnn_win_size=cnn_win_size, dense_size=dense_size, + out_dim=input_dim, activation=tf.nn.relu, + dense_activation=tf.nn.tanh) + # self.encoder = GRUEncoder(input_dim=input_dim, out_dim=input_dim, num_layers=1, dropout=0.1) + # self.encoder = T5Encoder() + + # self.decoder = ConditionalAttentionDecoder( + # input_dim, out_dim=num_classes, num_layers=1, num_heads=1, + # ff_hidden=100, target_vocab_size=num_classes, maximum_position_encoding=self.seq_len + # ) + self.decoder = FlatDecoder(out_dims=num_classes) self.crf_transition_params = None + self.supports_masking = True + self.use_graph = not no_graph - def __call__(self, token_ids, prefix_ids, suffix_ids, graph_ids, training=True): + # @tf.function + def __call__(self, token_ids, prefix_ids, suffix_ids, graph_ids, target=None, training=False, mask=None): """ - Inference + # Inference :param token_ids: ids for tokens, shape (?, seq_len) :param prefix_ids: ids for prefixes, shape (?, seq_len) :param suffix_ids: ids for suffixes, shape (?, seq_len) @@ -287,22 +151,35 @@ def __call__(self, token_ids, prefix_ids, suffix_ids, graph_ids, training=True): :param training: whether to finetune embeddings :return: logits for token classes, shape (?, seq_len, num_classes) """ + assert mask is not None, "Mask is required" + tok_emb = self.tok_emb(token_ids) - graph_emb = self.graph_emb(graph_ids) prefix_emb = self.prefix_emb(prefix_ids) suffix_emb = self.suffix_emb(suffix_ids) embs = tf.concat([tok_emb, - graph_emb, + # graph_emb, prefix_emb, suffix_emb], axis=-1) - logits = self.text_cnn(embs, training=training) + encoded = self.encoder(embs, training=training, mask=mask) + # if target is None: + # logits = self.decoder.seq_decode(encoded, training=training, mask=mask) + # else: + if self.use_graph: + graph_emb = self.graph_emb(graph_ids) + encoded = tf.concat([encoded, 
graph_emb], axis=-1) + logits, _ = self.decoder((encoded, target), training=training, mask=mask) # consider sending input instead of target return logits + def compute_mask(self, inputs, mask=None): + mask = self.encoder.compute_mask(None, mask=mask) + return self.decoder.compute_mask(None, mask=mask) - def loss(self, logits, labels, lengths, class_weights=None, extra_mask=None): + + # @tf.function + def loss(self, logits, labels, mask, class_weights=None, extra_mask=None): """ Compute cross-entropy loss for each meaningful tokens. Mask padded tokens. :param logits: shape (?, seq_len, num_classes) @@ -313,7 +190,7 @@ def loss(self, logits, labels, lengths, class_weights=None, extra_mask=None): :return: average cross-entropy loss """ losses = tf.nn.softmax_cross_entropy_with_logits(tf.one_hot(labels, depth=logits.shape[-1]), logits, axis=-1) - seq_mask = tf.sequence_mask(lengths, self.seq_len) + seq_mask = mask # logits._keras_mask# tf.sequence_mask(lengths, self.seq_len) if extra_mask is not None: seq_mask = tf.math.logical_and(seq_mask, extra_mask) if class_weights is None: @@ -329,28 +206,35 @@ def loss(self, logits, labels, lengths, class_weights=None, extra_mask=None): return loss - def score(self, logits, labels, lengths, scorer=None, extra_mask=None): + def score(self, logits, labels, mask, scorer=None, extra_mask=None): """ Compute precision, recall and f1 scores using the provided scorer function :param logits: shape (?, seq_len, num_classes) :param labels: ids of token labels, shape (?, seq_len) :param lengths: tensor of actual sentence lengths, shape (?,) - :param scorer: scorer function, takes `pred_labels` and `true_labels` as aguments + :param scorer: scorer function, takes `pred_labels` and `true_labels` as arguments :param extra_mask: mask for hiding some of the token labels, not counting them towards the score, shape (?, seq_len) :return: """ - mask = tf.sequence_mask(lengths, self.seq_len) + # mask = logits._keras_mask # tf.sequence_mask(lengths, 
self.seq_len) if extra_mask is not None: mask = tf.math.logical_and(mask, extra_mask) true_labels = tf.boolean_mask(labels, mask) argmax = tf.math.argmax(logits, axis=-1) estimated_labels = tf.cast(tf.boolean_mask(argmax, mask), tf.int32) - p, r, f1 = scorer(estimated_labels.numpy(), true_labels.numpy()) + p, r, f1 = scorer(to_numpy(estimated_labels), to_numpy(true_labels)) return p, r, f1 +def to_numpy(tensor): + if hasattr(tensor, "numpy"): + return tensor.numpy() + else: + return tf.make_ndarray(tf.make_tensor_proto(tensor)) + + # def estimate_crf_transitions(batches, n_tags): # transitions = [] # for _, _, labels, lengths in batches: @@ -359,7 +243,7 @@ def score(self, logits, labels, lengths, scorer=None, extra_mask=None): # # return np.stack(transitions, axis=0).mean(axis=0) - +# @tf.function def train_step_finetune(model, optimizer, token_ids, prefix, suffix, graph_ids, labels, lengths, extra_mask=None, class_weights=None, scorer=None, finetune=False): """ @@ -379,9 +263,11 @@ def train_step_finetune(model, optimizer, token_ids, prefix, suffix, graph_ids, :return: values for loss, precision, recall and f1-score """ with tf.GradientTape() as tape: - logits = model(token_ids, prefix, suffix, graph_ids, training=True) - loss = model.loss(logits, labels, lengths, class_weights=class_weights, extra_mask=extra_mask) - p, r, f1 = model.score(logits, labels, lengths, scorer=scorer, extra_mask=extra_mask) + seq_mask = tf.sequence_mask(lengths, token_ids.shape[1]) + logits = model(token_ids, prefix, suffix, graph_ids, target=None, training=True, mask=seq_mask) + loss = model.loss(logits, labels, mask=seq_mask, class_weights=class_weights, extra_mask=extra_mask) + # token_acc = tf.reduce_sum(tf.cast(tf.argmax(logits, axis=-1) == labels, tf.float32)) / (token_ids.shape[0] * token_ids.shape[1]) + p, r, f1 = model.score(logits, labels, mask=seq_mask, scorer=scorer, extra_mask=extra_mask) gradients = tape.gradient(loss, model.trainable_variables) if not finetune: # do not 
update embeddings @@ -393,7 +279,10 @@ def train_step_finetune(model, optimizer, token_ids, prefix, suffix, graph_ids, return loss, p, r, f1 # @tf.function -def test_step(model, token_ids, prefix, suffix, graph_ids, labels, lengths, extra_mask=None, class_weights=None, scorer=None): +def test_step( + model, token_ids, prefix, suffix, graph_ids, labels, lengths, extra_mask=None, class_weights=None, scorer=None, + no_localization=False +): """ :param model: TypePrediction model instance @@ -408,14 +297,44 @@ def test_step(model, token_ids, prefix, suffix, graph_ids, labels, lengths, extr :param scorer: scorer function, takes `pred_labels` and `true_labels` as aguments :return: values for loss, precision, recall and f1-score """ - logits = model(token_ids, prefix, suffix, graph_ids, training=False) - loss = model.loss(logits, labels, lengths, class_weights=class_weights, extra_mask=extra_mask) - p, r, f1 = model.score(logits, labels, lengths, scorer=scorer, extra_mask=extra_mask) + seq_mask = tf.sequence_mask(lengths, token_ids.shape[1]) + logits = model(token_ids, prefix, suffix, graph_ids, target=None, training=False, mask=seq_mask) + loss = model.loss(logits, labels, mask=seq_mask, class_weights=class_weights, extra_mask=extra_mask) + p, r, f1 = model.score(logits, labels, mask=seq_mask, scorer=scorer, extra_mask=extra_mask) return loss, p, r, f1 -def train(model, train_batches, test_batches, epochs, report_every=10, scorer=None, learning_rate=0.01, learning_rate_decay=1., finetune=False, summary_writer=None): +def test(model, test_batches, scorer=None): + test_alosses = [] + test_aps = [] + test_ars = [] + test_af1s = [] + + for ind, batch in enumerate(test_batches): + # token_ids, graph_ids, labels, class_weights, lengths = b + test_loss, test_p, test_r, test_f1 = test_step( + model=model, token_ids=batch['tok_ids'], + prefix=batch['prefix'], suffix=batch['suffix'], graph_ids=batch['graph_ids'], + labels=batch['tags'], lengths=batch['lens'], 
extra_mask=batch['hide_mask'], + # class_weights=batch['class_weights'], + scorer=scorer + ) + + test_alosses.append(test_loss) + test_aps.append(test_p) + test_ars.append(test_r) + test_af1s.append(test_f1) + + def avg(arr): + return sum(arr) / len(arr) + + return avg(test_alosses), avg(test_aps), avg(test_ars), avg(test_af1s) + +def train( + model, train_batches, test_batches, epochs, report_every=10, scorer=None, learning_rate=0.01, + learning_rate_decay=1., finetune=False, summary_writer=None, save_ckpt_fn=None, no_localization=False +): assert summary_writer is not None @@ -430,6 +349,8 @@ def train(model, train_batches, test_batches, epochs, report_every=10, scorer=No num_train_batches = len(train_batches) num_test_batches = len(test_batches) + best_f1 = 0. + try: with summary_writer.as_default(): @@ -440,12 +361,15 @@ def train(model, train_batches, test_batches, epochs, report_every=10, scorer=No rs = [] f1s = [] - for ind, batch in enumerate(train_batches): + start = time() + + for ind, batch in enumerate(tqdm(train_batches)): # token_ids, graph_ids, labels, class_weights, lengths = b loss, p, r, f1 = train_step_finetune( model=model, optimizer=optimizer, token_ids=batch['tok_ids'], prefix=batch['prefix'], suffix=batch['suffix'], graph_ids=batch['graph_ids'], - labels=batch['tags'], lengths=batch['lens'], extra_mask=batch['hide_mask'], + labels=batch['tags'], lengths=batch['lens'], + extra_mask=batch['no_loc_mask'] if no_localization else batch['hide_mask'], # class_weights=batch['class_weights'], scorer=scorer, finetune=finetune and e/epochs > 0.6 ) @@ -459,12 +383,18 @@ def train(model, train_batches, test_batches, epochs, report_every=10, scorer=No tf.summary.scalar("Recall/Train", r, step=e * num_train_batches + ind) tf.summary.scalar("F1/Train", f1, step=e * num_train_batches + ind) + test_alosses = [] + test_aps = [] + test_ars = [] + test_af1s = [] + for ind, batch in enumerate(test_batches): # token_ids, graph_ids, labels, class_weights, lengths = 
b test_loss, test_p, test_r, test_f1 = test_step( model=model, token_ids=batch['tok_ids'], prefix=batch['prefix'], suffix=batch['suffix'], graph_ids=batch['graph_ids'], - labels=batch['tags'], lengths=batch['lens'], extra_mask=batch['hide_mask'], + labels=batch['tags'], lengths=batch['lens'], + extra_mask=batch['no_loc_mask'] if no_localization else batch['hide_mask'], # class_weights=batch['class_weights'], scorer=scorer ) @@ -473,14 +403,24 @@ def train(model, train_batches, test_batches, epochs, report_every=10, scorer=No tf.summary.scalar("Precision/Test", test_p, step=e * num_test_batches + ind) tf.summary.scalar("Recall/Test", test_r, step=e * num_test_batches + ind) tf.summary.scalar("F1/Test", test_f1, step=e * num_test_batches + ind) + test_alosses.append(test_loss) + test_aps.append(test_p) + test_ars.append(test_r) + test_af1s.append(test_f1) - print(f"Epoch: {e}, Train Loss: {sum(losses) / len(losses)}, Train P: {sum(ps) / len(ps)}, Train R: {sum(rs) / len(rs)}, Train F1: {sum(f1s) / len(f1s)}, " - f"Test loss: {test_loss}, Test P: {test_p}, Test R: {test_r}, Test F1: {test_f1}") + epoch_time = time() - start train_losses.append(float(sum(losses) / len(losses))) train_f1s.append(float(sum(f1s) / len(f1s))) - test_losses.append(float(test_loss)) - test_f1s.append(float(test_f1)) + test_losses.append(float(sum(test_alosses) / len(test_alosses))) + test_f1s.append(float(sum(test_af1s) / len(test_af1s))) + + print(f"Epoch: {e}, {epoch_time: .2f} s, Train Loss: {train_losses[-1]: .4f}, Train P: {sum(ps) / len(ps): .4f}, Train R: {sum(rs) / len(rs): .4f}, Train F1: {sum(f1s) / len(f1s): .4f}, " + f"Test loss: {test_losses[-1]: .4f}, Test P: {sum(test_aps) / len(test_aps): .4f}, Test R: {sum(test_ars) / len(test_ars): .4f}, Test F1: {test_f1s[-1]: .4f}") + + if save_ckpt_fn is not None and float(test_f1s[-1]) > best_f1: + save_ckpt_fn() + best_f1 = float(test_f1s[-1]) lr.assign(lr * learning_rate_decay) diff --git 
a/SourceCodeTools/nlp/entity/type_prediction.py b/SourceCodeTools/nlp/entity/type_prediction.py index b15851b6..3394d0ef 100644 --- a/SourceCodeTools/nlp/entity/type_prediction.py +++ b/SourceCodeTools/nlp/entity/type_prediction.py @@ -1,25 +1,32 @@ from __future__ import unicode_literals, print_function import json +import logging import os import pickle from copy import copy from datetime import datetime +# os.environ['CUDA_VISIBLE_DEVICES'] = '-1' +os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' + +from pathlib import Path + import tensorflow from SourceCodeTools.nlp.batchers import PythonBatcher from SourceCodeTools.nlp.entity import parse_biluo from SourceCodeTools.nlp.entity.tf_models.params import cnn_params -from SourceCodeTools.nlp.entity.utils import get_unique_entities +from SourceCodeTools.nlp.entity.utils import get_unique_entities, overlap from SourceCodeTools.nlp.entity.utils.data import read_data def load_pkl_emb(path): """ - - :param path: - :return: + Load graph embeddings from a pickle file. Embeddigns are stored in class Embedder or in a list of Embedders. The + last embedder in the list is returned. + :param path: path to graph embeddigs stored as Embedder pickle + :return: Embedder object """ embedder = pickle.load(open(path, "rb")) if isinstance(embedder, list): @@ -27,7 +34,41 @@ def load_pkl_emb(path): return embedder -def scorer(pred, labels, tagmap, eps=1e-8): +def compute_precision_recall_f1(tp, fp, fn, eps=1e-8): + precision = tp / (tp + fp + eps) + recall = tp / (tp + fn + eps) + f1 = 2 * precision * recall / (precision + recall + eps) + return precision, recall, f1 + + +def localized_f1(pred_spans, true_spans, eps=1e-8): + + tp = 0. + fp = 0. + fn = 0. 
+ + for pred, true in zip(pred_spans, true_spans): + if true != "O": + if true == pred: + tp += 1 + else: + if pred == "O": + fn += 1 + else: + fp += 1 + + return compute_precision_recall_f1(tp, fp, fn) + + +def span_f1(pred_spans, true_spans, eps=1e-8): + tp = len(pred_spans.intersection(true_spans)) + fp = len(pred_spans - true_spans) + fn = len(true_spans - pred_spans) + + return compute_precision_recall_f1(tp, fp, fn) + + +def scorer(pred, labels, tagmap, no_localization=False, eps=1e-8): """ Compute f1 score, precision, and recall from BILUO labels :param pred: predicted BILUO labels @@ -42,16 +83,13 @@ def scorer(pred, labels, tagmap, eps=1e-8): pred_biluo = [tagmap.inverse(p) for p in pred] labels_biluo = [tagmap.inverse(p) for p in labels] - pred_spans = set(parse_biluo(pred_biluo)) - true_spans = set(parse_biluo(labels_biluo)) + if not no_localization: + pred_spans = set(parse_biluo(pred_biluo)) + true_spans = set(parse_biluo(labels_biluo)) - tp = len(pred_spans.intersection(true_spans)) - fp = len(pred_spans - true_spans) - fn = len(true_spans - pred_spans) - - precision = tp / (tp + fp + eps) - recall = tp / (tp + fn + eps) - f1 = 2 * precision * recall / (precision + recall + eps) + precision, recall, f1 = span_f1(pred_spans, true_spans, eps=eps) + else: + precision, recall, f1 = localized_f1(pred_biluo, labels_biluo, eps=eps) return precision, recall, f1 @@ -74,7 +112,8 @@ def write_config(trial_dir, params, extra_params=None): class ModelTrainer: def __init__(self, train_data, test_data, params, graph_emb_path=None, word_emb_path=None, - output_dir=None, epochs=30, batch_size=32, seq_len=100, finetune=False, trials=1): + output_dir=None, epochs=30, batch_size=32, seq_len=100, finetune=False, trials=1, + no_localization=False, ckpt_path=None, no_graph=False): self.set_batcher_class() self.set_model_class() @@ -89,6 +128,9 @@ def __init__(self, train_data, test_data, params, graph_emb_path=None, word_emb_ self.finetune = finetune self.trials = trials 
self.seq_len = seq_len + self.no_localization = no_localization + self.ckpt_path = ckpt_path + self.no_graph = no_graph def set_batcher_class(self): self.batcher = PythonBatcher @@ -97,16 +139,23 @@ def set_model_class(self): from SourceCodeTools.nlp.entity.tf_models.tf_model import TypePredictor self.model = TypePredictor - def get_batcher(self, *args, **kwards): - return self.batcher(*args, **kwards) + def get_batcher(self, *args, **kwargs): + return self.batcher(*args, **kwargs) def get_model(self, *args, **kwargs): - return self.model(*args, **kwargs) + model = self.model(*args, **kwargs) + if self.ckpt_path is not None: + model.load_weights(os.path.join(self.ckpt_path, "checkpoint")) + return model def train(self, *args, **kwargs): from SourceCodeTools.nlp.entity.tf_models.tf_model import train return train(*args, summary_writer=self.summary_writer, **kwargs) + def test(self, *args, **kwargs): + from SourceCodeTools.nlp.entity.tf_models.tf_model import test + return test(*args, **kwargs) + def create_summary_writer(self, path): self.summary_writer = tensorflow.summary.create_file_writer(path) @@ -114,48 +163,69 @@ def create_summary_writer(self, path): # with self.summary_writer.as_default(): # tensorflow.summary.scalar(value_name, value, step=step) - def train_model(self): + def get_dataloaders(self, word_emb, graph_emb, suffix_prefix_buckets): - graph_emb = load_pkl_emb(self.graph_emb_path) - word_emb = load_pkl_emb(self.word_emb_path) - - suffix_prefix_buckets = params.pop("suffix_prefix_buckets") + if self.ckpt_path is not None: + tagmap = pickle.load(open(os.path.join(self.ckpt_path, "tag_types.pkl"), "rb")) + else: + tagmap = None train_batcher = self.get_batcher( - train_data, self.batch_size, seq_len=self.seq_len, graphmap=graph_emb.ind, wordmap=word_emb.ind, tagmap=None, - class_weights=False, element_hash_size=suffix_prefix_buckets + self.train_data, self.batch_size, seq_len=self.seq_len, + graphmap=graph_emb.ind if graph_emb is not None else None, + 
wordmap=word_emb.ind, tagmap=tagmap, + class_weights=False, element_hash_size=suffix_prefix_buckets, no_localization=self.no_localization ) test_batcher = self.get_batcher( - test_data, self.batch_size, seq_len=self.seq_len, graphmap=graph_emb.ind, wordmap=word_emb.ind, + self.test_data, self.batch_size, seq_len=self.seq_len, + graphmap=graph_emb.ind if graph_emb is not None else None, + wordmap=word_emb.ind, tagmap=train_batcher.tagmap, # use the same mapping - class_weights=False, element_hash_size=suffix_prefix_buckets # class_weights are not used for testing + class_weights=False, element_hash_size=suffix_prefix_buckets, # class_weights are not used for testing + no_localization=self.no_localization ) + return train_batcher, test_batcher + + def train_model(self): + + graph_emb = load_pkl_emb(self.graph_emb_path) if self.graph_emb_path is not None else None + word_emb = load_pkl_emb(self.word_emb_path) + + suffix_prefix_buckets = params.pop("suffix_prefix_buckets") + + train_batcher, test_batcher = self.get_dataloaders(word_emb, graph_emb, suffix_prefix_buckets) print(f"\n\n{params}") lr = params.pop("learning_rate") lr_decay = params.pop("learning_rate_decay") - param_dir = os.path.join(output_dir, str(datetime.now())) + timestamp = str(datetime.now()).replace(":","-").replace(" ","_") + param_dir = os.path.join(output_dir, timestamp) os.mkdir(param_dir) for trial_ind in range(self.trials): trial_dir = os.path.join(param_dir, repr(trial_ind)) + logging.info(f"Running trial: {timestamp}") os.mkdir(trial_dir) self.create_summary_writer(trial_dir) model = self.get_model( word_emb, graph_emb, train_embeddings=self.finetune, suffix_prefix_buckets=suffix_prefix_buckets, - num_classes=train_batcher.num_classes(), seq_len=self.seq_len, **params + num_classes=train_batcher.num_classes(), seq_len=self.seq_len, no_graph=self.no_graph, **params ) + def save_ckpt_fn(): + checkpoint_path = os.path.join(trial_dir, "checkpoint") + model.save_weights(checkpoint_path) + 
train_losses, train_f1, test_losses, test_f1 = self.train( model=model, train_batches=train_batcher, test_batches=test_batcher, - epochs=self.epochs, learning_rate=lr, scorer=lambda pred, true: scorer(pred, true, train_batcher.tagmap), - learning_rate_decay=lr_decay, finetune=self.finetune + epochs=self.epochs, learning_rate=lr, scorer=lambda pred, true: scorer(pred, true, train_batcher.tagmap, no_localization=self.no_localization), + learning_rate_decay=lr_decay, finetune=self.finetune, save_ckpt_fn=save_ckpt_fn, no_localization=self.no_localization ) - checkpoint_path = os.path.join(trial_dir, "checkpoint") - model.save_weights(checkpoint_path) + # checkpoint_path = os.path.join(trial_dir, "checkpoint") + # model.save_weights(checkpoint_path) metadata = { "train_losses": train_losses, @@ -166,9 +236,13 @@ def train_model(self): "learning_rate_decay": lr_decay, "epochs": self.epochs, "suffix_prefix_buckets": suffix_prefix_buckets, - "seq_len": self.seq_len + "seq_len": self.seq_len, + "batch_size": self.batch_size, + "no_localization": self.no_localization } + print("Maximum f1:", max(test_f1)) + # write_config(trial_dir, params, extra_params={"suffix_prefix_buckets": suffix_prefix_buckets, "seq_len": seq_len}) metadata.update(params) @@ -184,11 +258,13 @@ def get_type_prediction_arguments(): parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--data_path', dest='data_path', default=None, - help='Path to the file with nodes') + help='Path to the dataset file') parser.add_argument('--graph_emb_path', dest='graph_emb_path', default=None, - help='Path to the file with edges') + help='Path to the file with graph embeddings') parser.add_argument('--word_emb_path', dest='word_emb_path', default=None, - help='Path to the file with edges') + help='Path to the file with token embeddings') + parser.add_argument('--type_ann_edges', dest='type_ann_edges', default=None, + help='Path to type annotation edges') 
parser.add_argument('--learning_rate', dest='learning_rate', default=0.01, type=float, help='') parser.add_argument('--learning_rate_decay', dest='learning_rate_decay', default=1.0, type=float, @@ -199,17 +275,34 @@ def get_type_prediction_arguments(): help='') parser.add_argument('--max_seq_len', dest='max_seq_len', default=100, type=int, help='') - # parser.add_argument('--pretrain_phase', dest='pretrain_phase', default=20, type=int, - # help='') + parser.add_argument('--min_entity_count', dest='min_entity_count', default=3, type=int, + help='') + parser.add_argument('--pretraining_epochs', dest='pretraining_epochs', default=0, type=int, + help='') + parser.add_argument('--ckpt_path', dest='ckpt_path', default=None, type=str, + help='') parser.add_argument('--epochs', dest='epochs', default=500, type=int, help='') parser.add_argument('--trials', dest='trials', default=1, type=int, help='') + parser.add_argument('--gpu', dest='gpu', default=-1, type=int, + help='Does not work with Tensorflow backend') parser.add_argument('--finetune', action='store_true') + parser.add_argument('--no_localization', action='store_true') + parser.add_argument('--restrict_allowed', action='store_true', default=False) + parser.add_argument('--no_graph', action='store_true', default=False) parser.add_argument('model_output', help='') args = parser.parse_args() + + if args.finetune is False and args.pretraining_epochs > 0: + logging.info(f"Finetuning is disabled, but the the number of pretraining epochs is {args.pretraining_epochs}. 
Setting pretraining epochs to 0.") + args.pretraining_epochs = 0 + + if args.graph_emb_path is not None and not os.path.isfile(args.graph_emb_path): + logging.warning(f"File with graph embeddings does not exist: {args.graph_emb_path}") + args.graph_emb_path = None return args @@ -219,6 +312,22 @@ def save_entities(path, entities): entitiesfile.write(f"{e}\n") +def filter_labels(dataset, allowed=None, field=None): + if allowed is None: + return dataset + dataset = copy(dataset) + for sent, annotations in dataset: + annotations["entities"] = [e for e in annotations["entities"] if e[2] in allowed] + return dataset + + +def find_example(dataset, needed_label): + for sent, annotations in dataset: + for start, end, e in annotations["entities"]: + if e == needed_label: + print(f"{sent}: {sent[start: end]}") + + if __name__ == "__main__": args = get_type_prediction_arguments() @@ -228,10 +337,27 @@ def save_entities(path, entities): # allowed = {'str', 'bool', 'Optional', 'None', 'int', 'Any', 'Union', 'List', 'Dict', 'Callable', 'ndarray', # 'FrameOrSeries', 'bytes', 'DataFrame', 'Matcher', 'float', 'Tuple', 'bool_t', 'Description', 'Type'} - - train_data, test_data = read_data( - open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, include_only="entities", - min_entity_count=1, random_seed=args.random_seed + if args.restrict_allowed: + allowed = { + 'str', 'Optional', 'int', 'Any', 'Union', 'bool', 'Callable', 'Dict', 'bytes', 'float', 'Description', + 'List', 'Sequence', 'Namespace', 'T', 'Type', 'object', 'HTTPServerRequest', 'Future', "Matcher" + } + else: + allowed = None + + # train_data, test_data = read_data( + # open(args.data_path, "r").readlines(), normalize=True, allowed=None, include_replacements=True, include_only="entities", + # min_entity_count=args.min_entity_count, random_seed=args.random_seed + # ) + + dataset_dir = Path(args.data_path).parent + train_data = filter_labels( + 
pickle.load(open(dataset_dir.joinpath("type_prediction_dataset_no_defaults_train.pkl"), "rb")), + allowed=allowed + ) + test_data = filter_labels( + pickle.load(open(dataset_dir.joinpath("type_prediction_dataset_no_defaults_test.pkl"), "rb")), + allowed=allowed ) unique_entities = get_unique_entities(train_data, field="entities") @@ -241,6 +367,7 @@ def save_entities(path, entities): trainer = ModelTrainer( train_data, test_data, params, graph_emb_path=args.graph_emb_path, word_emb_path=args.word_emb_path, output_dir=output_dir, epochs=args.epochs, batch_size=args.batch_size, - finetune=args.finetune, trials=args.trials, seq_len=args.max_seq_len, + finetune=args.finetune, trials=args.trials, seq_len=args.max_seq_len, no_localization=args.no_localization, + ckpt_path=args.ckpt_path, no_graph=args.no_graph ) trainer.train_model() diff --git a/SourceCodeTools/nlp/entity/utils/__init__.py b/SourceCodeTools/nlp/entity/utils/__init__.py index 52df6d10..8cf742ca 100644 --- a/SourceCodeTools/nlp/entity/utils/__init__.py +++ b/SourceCodeTools/nlp/entity/utils/__init__.py @@ -24,7 +24,7 @@ def normalize_entities(entities): - norm = lambda x: x.split("[")[0].split(".")[-1] + norm = lambda x: x.strip("\"").strip("'").split("[")[0].split(".")[-1] if len(entities) == 0: return entities diff --git a/SourceCodeTools/nlp/entity/utils/data.py b/SourceCodeTools/nlp/entity/utils/data.py index 8ef6652c..8135b607 100644 --- a/SourceCodeTools/nlp/entity/utils/data.py +++ b/SourceCodeTools/nlp/entity/utils/data.py @@ -55,7 +55,7 @@ def read_data( logging.info("Splitting dataset randomly") else: random.seed(random_seed) - logging.warning(f"Using ransom seed {random_seed} for dataset split") + logging.warning(f"Using random seed {random_seed} for dataset split") filter_infrequent( train_data, entities_in_dataset=Counter(entities_in_dataset), diff --git a/SourceCodeTools/nlp/entity/utils/result-visualizer.py b/SourceCodeTools/nlp/entity/utils/result-visualizer.py index b6cf26d6..877ba961 
100644 --- a/SourceCodeTools/nlp/entity/utils/result-visualizer.py +++ b/SourceCodeTools/nlp/entity/utils/result-visualizer.py @@ -41,28 +41,31 @@ if not os.path.isdir(output_path): os.mkdir(output_path) -plt.figure(figsize=(20, 10)) +# plt.figure(figsize=(20, 10)) for s in test_f1: plt.plot(s) plt.title("Test F1") plt.legend(legend, loc='upper left') -plt.savefig(os.path.join(output_path, "test_f1.png")) +plt.grid() +plt.savefig(os.path.join(output_path, "test_f1.svg")) plt.close() for train_loss, train_f1, test_loss, test_f1, label in zip(train_losses, train_f1, test_losses, test_f1, legend): - plt.figure(figsize=(20, 10)) + # plt.figure(figsize=(20, 10)) plt.plot(train_loss) plt.plot(test_loss) plt.yscale('log') + plt.grid() plt.title(f"{label} loss") plt.legend(["Train", "Test"], loc='upper left') - plt.savefig(os.path.join(output_path, f"{label}_loss.png")) + plt.savefig(os.path.join(output_path, f"{label}_loss.svg")) plt.close() - plt.figure(figsize=(20, 10)) + # plt.figure(figsize=(20, 10)) plt.plot(train_f1) plt.plot(test_f1) + plt.grid() plt.title(f"{label} f1") plt.legend(["Train", "test"], loc='upper left') - plt.savefig(os.path.join(output_path, f"{label}_f1.png")) + plt.savefig(os.path.join(output_path, f"{label}_f1.svg")) plt.close() diff --git a/SourceCodeTools/nlp/entity/utils/visualize_dataset.py b/SourceCodeTools/nlp/entity/utils/visualize_dataset.py new file mode 100644 index 00000000..7098156e --- /dev/null +++ b/SourceCodeTools/nlp/entity/utils/visualize_dataset.py @@ -0,0 +1,36 @@ +import os.path + +import spacy +import sys +import json +from spacy.gold import biluo_tags_from_offsets +from spacy.tokenizer import Tokenizer +import re + +from SourceCodeTools.nlp import create_tokenizer +from SourceCodeTools.nlp.entity import parse_biluo +from SourceCodeTools.nlp.entity.entity_render import render_annotations + +annotations_path = sys.argv[1] +output_path = os.path.dirname(annotations_path) + +data = [] +references = [] +results = [] + +nlp = 
create_tokenizer("spacy") + +with open(annotations_path) as annotations: + for line in annotations: + entry = json.loads(line.strip()) + + doc = nlp(entry['text']) + tags_r = parse_biluo(biluo_tags_from_offsets(doc, entry['replacements'])) + tags_e = parse_biluo(biluo_tags_from_offsets(doc, entry['ents'])) + + data.append([entry['text']]) + references.append(tags_r) + results.append(tags_e) + +html = render_annotations(zip(data, references, results)) +open(os.path.join(output_path, "annotations.html"), "w").write(html) \ No newline at end of file diff --git a/SourceCodeTools/nlp/generation/data.py b/SourceCodeTools/nlp/generation/data.py new file mode 100644 index 00000000..9f8f0c68 --- /dev/null +++ b/SourceCodeTools/nlp/generation/data.py @@ -0,0 +1 @@ +# dataloader for documentation generation \ No newline at end of file diff --git a/SourceCodeTools/nlp/tokenizers.py b/SourceCodeTools/nlp/tokenizers.py index e20fa9e7..40084d1c 100644 --- a/SourceCodeTools/nlp/tokenizers.py +++ b/SourceCodeTools/nlp/tokenizers.py @@ -75,9 +75,53 @@ def default_tokenizer(text): from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer return make_tokenizer(load_bpe_model(bpe_path)) + elif type == "codebert": + from transformers import RobertaTokenizer + import spacy + from spacy.tokens import Doc + + tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base") + nlp = spacy.blank("en") + + def tokenize(text): + tokens = tokenizer.tokenize(text) + doc = Doc(nlp.vocab, tokens, spaces=[False] * len(tokens)) + return doc + + return tokenize else: raise Exception("Supported tokenizer types: spacy, regex, bpe") + +def codebert_to_spacy(tokens): + backup_tokens = tokens + fixed_spaces = [False] + fixed_words = [""] + + for ind, t in enumerate(tokens): + if len(t.text) > 1: + fixed_words.append(t.text.strip("Ġ")) + else: + fixed_words.append(t.text) + if ind != 0: + fixed_spaces.append(t.text.startswith("Ġ") and len(t.text) > 1) + fixed_spaces.append(False) + 
fixed_spaces.append(False) + fixed_words.append("") + + assert len(fixed_spaces) == len(fixed_words) + + from spacy.tokens import Doc + import spacy + doc = Doc(spacy.blank("en").vocab, fixed_words, fixed_spaces) + + assert len(doc) - 2 == len(backup_tokens) + assert len(doc.text) - 7 == len(backup_tokens.text) + + adjustment = -3 + # spans = [adjust_offsets(sp, -3) for sp in spans] + return doc, adjustment + # import tokenize # from io import BytesIO # tokenize.tokenize(BytesIO(s.encode('utf-8')).readline) \ No newline at end of file diff --git a/SourceCodeTools/tabular/common.py b/SourceCodeTools/tabular/common.py index 771945fa..ec4a74ac 100644 --- a/SourceCodeTools/tabular/common.py +++ b/SourceCodeTools/tabular/common.py @@ -2,6 +2,13 @@ def compact_property(values, return_order=False, index_from_one=False): + """ + Returns a map from the original value to the compact index + :param values: + :param return_order: + :param index_from_one: + :return: + """ uniq = numpy.unique(values) if index_from_one: index = range(1, uniq.size + 1) @@ -12,5 +19,5 @@ def compact_property(values, return_order=False, index_from_one=False): inv_index = uniq.tolist() if index_from_one: inv_index.insert(0, "NA") - return prop2pid, + return prop2pid, inv_index return prop2pid \ No newline at end of file diff --git a/Subgraph Classification.ipynb b/Subgraph Classification.ipynb new file mode 100644 index 00000000..332ebe7b --- /dev/null +++ b/Subgraph Classification.ipynb @@ -0,0 +1,816 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "244ac956", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bdefc65e", + "metadata": {}, + "outputs": [], + "source": [ + "from random import random\n", + "\n", + "from SourceCodeTools.models.training_config import get_config, save_config, load_config\n", + "from 
SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq\n", + "from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer\n", + "from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective\n", + "from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \\\n", + " SubgraphClassifierObjective, SubgraphEmbeddingObjective\n", + "from SourceCodeTools.models.graph.train.utils import get_name, get_model_base\n", + "from SourceCodeTools.models.graph import RGGAN\n", + "from SourceCodeTools.tabular.common import compact_property\n", + "from SourceCodeTools.code.data.file_utils import unpersist\n", + "from SourceCodeTools.models.graph.train.objectives.SCAAClassifierObjetive import SCAAClassifierObjective\n", + "import dgl\n", + "import torch\n", + "import numpy as np\n", + "from argparse import Namespace\n", + "from torch import nn\n", + "from datetime import datetime\n", + "from os.path import join\n", + "from functools import partial" + ] + }, + { + "cell_type": "markdown", + "id": "4becf482", + "metadata": { + "tags": [] + }, + "source": [ + "# Prepare parameters and options\n", + "\n", + "Full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are ment to be used as arguments for cli trainer. Trainer script can be found in `SourceCodeTools/scripts/train.py`.\n", + "\n", + "For the task of subgraph classification the important options are:\n", + "- `subgraph_partition` is path to subgraph-based train/val/test sets. Storead as Dataframe with subgraph id and partition mask\n", + "- `subgraph_id_column` is a column is `common_edges` file that stores subgraph id.\n", + "- For variable misuse task (same will apply to authorship attribution) subgraphs are created for individual functions (files for SCAA). 
The label is stored in `common_filecontent`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e5dc4fa", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5637f70e-b8fd-4a9f-9956-d60a61bd2870", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_path = \"examples/sentencepiece_bpe.model\"\n", + "\n", + "data_path = \"examples/one_to_one/with_ast\"\n", + "subgraph_partition = join(data_path, \"subgraph_partition.json.bz2\")\n", + "filecontent_path = join(data_path, \"common_filecontent.json.bz2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "404565f5-09a3-4535-ba51-8533491f7c72", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0.1Unnamed: 0taskuseryearpackagefilecontentid
0249824983alex17702020False\\ndef getstring(): return input().strip()\\ndef...2498
1464446444sashadotcom2020Falseimport sys\\nfrom itertools import combinations...4644
2486448644superkjelle2020False[t,b] = input().split(' ')\\nt = int(t)\\nb = in...4864
32702700BananaNosh2020FalsePARENTS = {0: \"C\", 1: \"J\"}\\n\\n\\ndef print_outp...270
454002057TinyTitans2020Falsedef TestCase():\\n N = int(input())\\n ans...5400
...........................
55332892894Basti_SUI2020Falseimport sys\\n\\n\\ndef inverse(arr):\\n return ...289
5534255625561anari2020FalseT = int(input())\\nfor t in range(T):\\n N = ...2556
55353153150Bhavyalekha2020Falsedef f(raw_arr):\\n arr = []\\n for i in ra...315
5536153115311Poornachander2020Falset1=int(input())\\nli=[]\\nfor j in range(len(li)...1531
55378238233Hashcoder2020Falsefor tc in range(int(input())):\\n size, trac...823
\n", + "

5538 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0.1 Unnamed: 0 task user year package \\\n", + "0 2498 2498 3 alex1770 2020 False \n", + "1 4644 4644 4 sashadotcom 2020 False \n", + "2 4864 4864 4 superkjelle 2020 False \n", + "3 270 270 0 BananaNosh 2020 False \n", + "4 5400 205 7 TinyTitans 2020 False \n", + "... ... ... ... ... ... ... \n", + "5533 289 289 4 Basti_SUI 2020 False \n", + "5534 2556 2556 1 anari 2020 False \n", + "5535 315 315 0 Bhavyalekha 2020 False \n", + "5536 1531 1531 1 Poornachander 2020 False \n", + "5537 823 823 3 Hashcoder 2020 False \n", + "\n", + " filecontent id \n", + "0 \\ndef getstring(): return input().strip()\\ndef... 2498 \n", + "1 import sys\\nfrom itertools import combinations... 4644 \n", + "2 [t,b] = input().split(' ')\\nt = int(t)\\nb = in... 4864 \n", + "3 PARENTS = {0: \"C\", 1: \"J\"}\\n\\n\\ndef print_outp... 270 \n", + "4 def TestCase():\\n N = int(input())\\n ans... 5400 \n", + "... ... ... \n", + "5533 import sys\\n\\n\\ndef inverse(arr):\\n return ... 289 \n", + "5534 T = int(input())\\nfor t in range(T):\\n N = ... 2556 \n", + "5535 def f(raw_arr):\\n arr = []\\n for i in ra... 315 \n", + "5536 t1=int(input())\\nli=[]\\nfor j in range(len(li)... 1531 \n", + "5537 for tc in range(int(input())):\\n size, trac... 823 \n", + "\n", + "[5538 rows x 8 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(filecontent_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d09b740a-eaaf-4fac-a73f-0bd9bb671f70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesource_node_idtarget_node_idfile_idmentioned_in
00subword2139055164822498NaN
11func516482257761249846283.0
22func_rev257761516482249846283.0
33value257761107985249846283.0
44value_rev107985257761249846283.0
.....................
2852961294prev2080306985515397693983.0
2862961295defined_in_module2080306939835397693983.0
2872961296defined_in_module_rev6939832080305397693983.0
2882961297subword7082314336115397NaN
2892961298subword3624244336115397NaN
\n", + "

290 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id type source_node_id target_node_id file_id \\\n", + "0 0 subword 213905 516482 2498 \n", + "1 1 func 516482 257761 2498 \n", + "2 2 func_rev 257761 516482 2498 \n", + "3 3 value 257761 107985 2498 \n", + "4 4 value_rev 107985 257761 2498 \n", + ".. ... ... ... ... ... \n", + "285 2961294 prev 208030 698551 5397 \n", + "286 2961295 defined_in_module 208030 693983 5397 \n", + "287 2961296 defined_in_module_rev 693983 208030 5397 \n", + "288 2961297 subword 708231 433611 5397 \n", + "289 2961298 subword 362424 433611 5397 \n", + "\n", + " mentioned_in \n", + "0 NaN \n", + "1 46283.0 \n", + "2 46283.0 \n", + "3 46283.0 \n", + "4 46283.0 \n", + ".. ... \n", + "285 693983.0 \n", + "286 693983.0 \n", + "287 693983.0 \n", + "288 NaN \n", + "289 NaN \n", + "\n", + "[290 rows x 6 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(join(data_path, \"common_edges.json.bz2\"), nrows=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d38f83db", + "metadata": {}, + "outputs": [], + "source": [ + "config = get_config(\n", + " subgraph_id_column=\"file_id\",\n", + " data_path=data_path,\n", + " model_output_dir=data_path,\n", + " subgraph_partition=subgraph_partition,\n", + " tokenizer_path=tokenizer_path,\n", + " objectives=\"subgraph_clf\",\n", + " epochs=1000,\n", + " elem_emb_size=6, # the smallest size of the graph in dataset\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bf61e1a2-2a2b-4688-b6b6-4ad3c22b22a5", + "metadata": {}, + "outputs": [], + "source": [ + "save_config(config, \"var_misuse_tiny.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "07dc7b86", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DATASET': {'data_path': 'examples/one_to_one/with_ast',\n", + " 'train_frac': 0.9,\n", + " 'filter_edges': None,\n", + " 'min_count_for_objectives': 5,\n", + " 
'self_loops': False,\n", + " 'use_node_types': False,\n", + " 'use_edge_types': False,\n", + " 'no_global_edges': False,\n", + " 'remove_reverse': False,\n", + " 'custom_reverse': None,\n", + " 'restricted_id_pool': None,\n", + " 'random_seed': None,\n", + " 'subgraph_id_column': 'file_id',\n", + " 'subgraph_partition': 'examples/one_to_one/with_ast/subgraph_partition.json.bz2'},\n", + " 'TRAINING': {'model_output_dir': 'examples/one_to_one/with_ast',\n", + " 'pretrained': None,\n", + " 'pretraining_phase': 0,\n", + " 'sampling_neighbourhood_size': 10,\n", + " 'neg_sampling_factor': 3,\n", + " 'use_layer_scheduling': False,\n", + " 'schedule_layers_every': 10,\n", + " 'elem_emb_size': 6,\n", + " 'embedding_table_size': 200000,\n", + " 'epochs': 1000,\n", + " 'batch_size': 128,\n", + " 'learning_rate': 0.001,\n", + " 'objectives': 'subgraph_clf',\n", + " 'save_each_epoch': False,\n", + " 'save_checkpoints': True,\n", + " 'early_stopping': False,\n", + " 'early_stopping_tolerance': 20,\n", + " 'force_w2v_ns': False,\n", + " 'use_ns_groups': False,\n", + " 'nn_index': 'brute',\n", + " 'metric': 'inner_prod',\n", + " 'measure_scores': False,\n", + " 'dilate_scores': 200,\n", + " 'gpu': -1,\n", + " 'external_dataset': None,\n", + " 'restore_state': False},\n", + " 'MODEL': {'node_emb_size': 100,\n", + " 'h_dim': 100,\n", + " 'n_layers': 5,\n", + " 'use_self_loop': True,\n", + " 'use_gcn_checkpoint': False,\n", + " 'use_att_checkpoint': False,\n", + " 'use_gru_checkpoint': False,\n", + " 'num_bases': 10,\n", + " 'dropout': 0.0,\n", + " 'activation': 'tanh'},\n", + " 'TOKENIZER': {'tokenizer_path': 'examples/sentencepiece_bpe.model'}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "markdown", + "id": "069a2528", + "metadata": {}, + "source": [ + "# Create Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d0ce29fc", + "metadata": {}, + "outputs": 
[ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/alina/workspace/scaa/method-embedding/SourceCodeTools/code/data/dataset/Dataset.py:129: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " self.nodes = self.nodes.append(new_nodes, ignore_index=True)\n", + "/home/alina/workspace/scaa/method-embedding/SourceCodeTools/code/data/dataset/Dataset.py:130: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " self.edges = self.edges.append(new_edges, ignore_index=True)\n" + ] + } + ], + "source": [ + "dataset = SourceGraphDataset(\n", + " **{**config[\"DATASET\"], **config[\"TOKENIZER\"]}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "897d41bf", + "metadata": {}, + "source": [ + "# Declare target loading function (labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c279f43e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def load_labels():\n", + " filecontent = unpersist(filecontent_path)\n", + " return filecontent[[\"id\", \"user\"]].rename({\"id\": \"src\", \"user\": \"dst\"}, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "1ac27d90", + "metadata": {}, + "source": [ + "One or several objectives could be used" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "67605a81", + "metadata": {}, + "outputs": [], + "source": [ + "class Trainer(SamplingMultitaskTrainer):\n", + " def create_objectives(self, dataset, tokenizer_path):\n", + " self.objectives = nn.ModuleList()\n", + " \n", + " self.objectives.append(\n", + " self._create_subgraph_objective(\n", + " objective_name=\"VariableMisuseClf\",\n", + " objective_class=SCAAClassifierObjective,\n", + " dataset=dataset,\n", + " tokenizer_path=tokenizer_path,\n", + " labels_fn=load_labels,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 12, + "id": "393251c2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "NOTE: Using experimental fast data loading logic. To disable, pass\n", + " \"--load_fast=false\" and report issues on GitHub. More details:\n", + " https://github.com/tensorflow/tensorboard/issues/4784\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensorflow listening on http://localhost:6006/\n" + ] + } + ], + "source": [ + "# %tensorboard --logdir data\n", + "from tensorboard import program\n", + "\n", + "tracking_address = './eamples/one_to_one/' # the path of your log file.\n", + "\n", + "tb = program.TensorBoard()\n", + "tb.configure(argv=[None, '--logdir', tracking_address])\n", + "url = tb.launch()\n", + "print(f\"Tensorflow listening on {url}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "369ed1bb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Precompute Target Embeddings: 0%| | 0/44 [00:00 + + + + Python Source Group + $PYTHON_ENV + + .py + + + . + + enabled + Python Source Group + + + 8 +" > $PRJ_PATH + fi +} + + +run_indexer () { + repo=$1 + if [ ! -f "$repo/sourcetrail.log" ]; then + run_indexing=true + else + find_edges=$(cat "$repo/sourcetrail.log" | grep " Edges") + if [ -z "$find_edges" ]; then + echo "Indexing was interrupted, recovering..." 
+ run_indexing=true + else + run_indexing=false + fi + fi + + if $run_indexing; then + echo "Begin indexing" + Sourcetrail.sh index -i $repo/$repo.srctrlprj >> $repo/sourcetrail.log + else + echo "Already indexed" + fi +} + +#echo "Running from $RUN_DIR" +#echo "Python env path $PYTHON_ENV" +#echo "Data path $DATA_PATH" +#echo "Verifier path $VERIFIER_PATH" + +for repo_dir in $DATA_PATH/* +do +# echo $repo_dir +# repo=$DATA_PATH/$folder + if [ -d "$repo_dir" ] + then + repo="$(basename $repo_dir)" + echo $repo + cd $DATA_PATH + create_sourcetrail_project_if_not_exist $PYTHON_ENV "$repo/$repo.srctrlprj" + run_indexer "$repo" + + if [ -f "$repo/$repo.srctrldb" ]; then + cd "$repo" + echo $(pwd) + echo ".headers on +.mode csv +.output edges.csv +SELECT * FROM edge; +.output nodes.csv +SELECT * FROM node; +.output element_component.csv +SELECT * FROM element_component; +.output source_location.csv +SELECT * FROM source_location; +.output occurrence.csv +SELECT * FROM occurrence; +.output filecontent.csv +SELECT * FROM filecontent; +.quit" | sqlite3 "$repo.srctrldb" + python $VERIFIER_PATH . + cd .. 
+ else + echo "Package not indexed" + fi + + cd $RUN_DIR + fi +done diff --git a/docker/preprocess_with_sourcetrail/sourcetrail_verify_files.py b/docker/preprocess_with_sourcetrail/sourcetrail_verify_files.py new file mode 100644 index 00000000..622ac79f --- /dev/null +++ b/docker/preprocess_with_sourcetrail/sourcetrail_verify_files.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import pandas as pd +import sys, os + +def verify_files(working_dir): + + fileheaders = { + "edges.csv": "id,type,source_node_id,target_node_id\n", + "nodes.csv": "id,type,serialized_name\n", + "element_component.csv": "id,element_id,type,data\n", + "source_location.csv": "id,file_node_id,start_line,start_column,end_line,end_column,type\n", + "occurrence.csv": "element_id,source_location_id\n", + "filecontent.csv": "id,content\n" + } + + for filename, header in fileheaders.items(): + file_path = os.path.join(working_dir, filename) + try: + pd.read_csv(file_path) + except pd.errors.EmptyDataError: + with open(file_path, "w") as sink: + sink.write(header) + + +if __name__ == "__main__": + working_dir = sys.argv[1] + verify_files(working_dir) \ No newline at end of file diff --git a/examples/Node Classification 2nd part.ipynb b/examples/Node Classification 2nd part.ipynb new file mode 100644 index 00000000..a1790f6b --- /dev/null +++ b/examples/Node Classification 2nd part.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using backend: pytorch\n" + ] + } + ], + "source": [ + "from random import random\n", + "\n", + "from SourceCodeTools.models.training_config import get_config, save_config, load_config\n", + "from SourceCodeTools.code.data.dataset.Dataset import 
SourceGraphDataset, filter_dst_by_freq\n", + "from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer\n", + "from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective\n", + "from SourceCodeTools.models.graph.train.utils import get_name, get_model_base\n", + "from SourceCodeTools.models.graph import RGGAN\n", + "from SourceCodeTools.tabular.common import compact_property\n", + "from SourceCodeTools.code.data.file_utils import unpersist\n", + "\n", + "import dgl\n", + "import torch\n", + "import numpy as np\n", + "from torch import nn\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare parameters and options\n", + "\n", + "Full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are ment to be used as arguments for cli trainer. Trainer script can be found in `SourceCodeTools/scripts/train.py`.\n", + "\n", + "There are a lot of parameters. Ones that might be of interest are marked with `***`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_config(\n", + " # tokenizer\n", + " tokenizer_path=\"sentencepiece_bpe.model\", # *** path to sentencepiece model\n", + " \n", + " # dataset parameters\n", + " data_path=\"large_graph\", # *** path to node type\n", + " use_node_types=False, # node types currently not supported\n", + " use_edge_types=True, # whether to use edge types\n", + " filter_edges=None, # None or list of edge type names\n", + " self_loops=False, # whether to use self loops\n", + " train_frac=0.8, # *** fraction of nodes to use for training\n", + " random_seed=42, # random seed for splitting dataset int o train test validation\n", + " min_count_for_objectives=5, # *** minimum frequency of targets\n", + " no_global_edges=False, # remove global edges\n", + " remove_reverse=False, # remove reverse edges\n", + " custom_reverse=None, # None or list of edges, for which reverse edges should be created (use together with `remove_reverse`)\n", + " \n", + " # training parameters\n", + " model_output_dir=\"large_graph\", # *** directory to save checkpoints and training data\n", + " batch_size=128, # *** \n", + " sampling_neighbourhood_size=10, # number of dependencies to sample for each node\n", + " neg_sampling_factor=1, # *** number of negative samples for each positive sample\n", + " epochs=10, # *** number of epochs\n", + " elem_emb_size=100, # *** dimensionality of target embeddings (for node name prediction)\n", + " pretraining_phase=0, # number of epochs for pretraining\n", + " embedding_table_size=200000, # *** embedding table size for subwords\n", + " save_checkpoints=False, # set to False if checkpoints are not needed\n", + " save_each_epoch=False, # save each epoch, useful in case of studying model behavior\n", + " measure_scores=True, # *** measure ranking scores during evaluation\n", + " dilate_scores=200, # downsampling factor for measuring scores to make evaluation 
faster\n", + " objectives=\"node_clf\", # type of objective\n", + " force_w2v_ns=True, # negative sampling strategy\n", + " gpu=-1, # gpuid\n", + " restore_state=False,\n", + " pretrained=None,\n", + " \n", + " # model parameters\n", + " node_emb_size=100, # *** dimensionality of node embeddings\n", + " h_dim=100, # *** should match to node dimensionality\n", + " num_bases=10, # number of bases for computing parmetwer weights for different edge types\n", + " dropout=0.2, # *** \n", + " use_self_loop=True, #\n", + " activation=\"tanh\", # *** \n", + " learning_rate=1e-3, # *** \n", + " use_gcn_checkpoint=True,\n", + " use_att_checkpoint=True,\n", + " use_gru_checkpoint=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "save_config(config, \"config.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "config = load_config(\"config.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DATASET': {'custom_reverse': None,\n", + " 'data_path': 'large_graph',\n", + " 'filter_edges': None,\n", + " 'min_count_for_objectives': 5,\n", + " 'no_global_edges': False,\n", + " 'random_seed': 42,\n", + " 'remove_reverse': False,\n", + " 'restricted_id_pool': None,\n", + " 'self_loops': False,\n", + " 'train_frac': 0.8,\n", + " 'use_edge_types': True,\n", + " 'use_node_types': False},\n", + " 'MODEL': {'activation': 'tanh',\n", + " 'dropout': 0.2,\n", + " 'h_dim': 100,\n", + " 'n_layers': 5,\n", + " 'node_emb_size': 100,\n", + " 'num_bases': 10,\n", + " 'use_att_checkpoint': True,\n", + " 'use_gcn_checkpoint': True,\n", + " 'use_gru_checkpoint': True,\n", + " 'use_self_loop': True},\n", + " 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'},\n", + " 'TRAINING': {'batch_size': 128,\n", + " 'dilate_scores': 200,\n", + " 'early_stopping': False,\n", + " 
'early_stopping_tolerance': 20,\n", + " 'elem_emb_size': 100,\n", + " 'embedding_table_size': 200000,\n", + " 'epochs': 10,\n", + " 'external_dataset': None,\n", + " 'force_w2v_ns': True,\n", + " 'gpu': -1,\n", + " 'learning_rate': 0.001,\n", + " 'measure_scores': True,\n", + " 'metric': 'inner_prod',\n", + " 'model_output_dir': 'large_graph',\n", + " 'neg_sampling_factor': 1,\n", + " 'nn_index': 'brute',\n", + " 'objectives': 'node_clf',\n", + " 'pretrained': None,\n", + " 'pretraining_phase': 0,\n", + " 'restore_state': False,\n", + " 'sampling_neighbourhood_size': 10,\n", + " 'save_checkpoints': False,\n", + " 'save_each_epoch': False,\n", + " 'schedule_layers_every': 10,\n", + " 'use_layer_scheduling': False,\n", + " 'use_ns_groups': False}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Random state for splitting dataset is fixed\n" + ] + } + ], + "source": [ + "dataset = SourceGraphDataset(\n", + " **{**config[\"DATASET\"], **config[\"TOKENIZER\"]},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Declare target loading function (labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def load_type_prediction():\n", + " from SourceCodeTools.code.data.dataset.reader import load_data\n", + " \n", + " nodes, edges = dataset.nodes, dataset.edges\n", + " \n", + " type_ann = unpersist(\"large_graph/type_annotations.json.bz2\").query(\"src in @node_ids\", local_dict={\"node_ids\": nodes[\"id\"]})\n", + " \n", + " norm = lambda x: x.strip(\"\\\"\").strip(\"'\").split(\"[\")[0].split(\".\")[-1]\n", + "\n", + " 
type_ann[\"dst\"] = type_ann[\"dst\"].apply(norm)\n", + " type_ann = filter_dst_by_freq(type_ann, config[\"DATASET\"][\"min_count_for_objectives\"])\n", + " type_ann = type_ann[[\"src\", \"dst\"]]\n", + "\n", + " return type_ann" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define objectives\n", + "\n", + "Currenlty objectives for node classification (`NodeClassifierObjective`), and name-based node embedding training `SubwordEmbedderObjective`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](\"examples/figures/img1.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One or several objectives could be used" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "class Trainer(SamplingMultitaskTrainer):\n", + " def create_objectives(self, dataset, tokenizer_path):\n", + " self.objectives = nn.ModuleList()\n", + " \n", + "# self.objectives.append(\n", + "# NodeClassifierObjective(\n", + "# \"NodeTypeClassifier\",\n", + "# self.graph_model, self.node_embedder, dataset.nodes,\n", + "# dataset.load_node_classes, # need to define this function\n", + "# self.device, self.sampling_neighbourhood_size, self.batch_size,\n", + "# tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size,\n", + "# masker=dataset.create_node_clf_masker(), # this is needed only for node type classification\n", + "# measure_scores=self.trainer_params[\"measure_scores\"],\n", + "# dilate_scores=self.trainer_params[\"dilate_scores\"]\n", + "# )\n", + "# )\n", + " \n", + " self.objectives.append(\n", + " NodeClassifierObjective(\n", + " \"TypeAnnPrediction\",\n", + " self.graph_model, self.node_embedder, dataset.nodes,\n", + " load_type_prediction, # need to define this function\n", + " self.device, self.sampling_neighbourhood_size, self.batch_size,\n", + " tokenizer_path=tokenizer_path, target_emb_size=self.elem_emb_size, \n", + " masker=None, 
# masker is not needed here\n", + " measure_scores=self.trainer_params[\"measure_scores\"],\n", + " dilate_scores=self.trainer_params[\"dilate_scores\"]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ERROR: Could not find `tensorboard`. Please ensure that your PATH\n", + "contains an executable `tensorboard` program, or explicitly specify\n", + "the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`\n", + "environment variable." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%tensorboard --logdir \"large_graph\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of nodes 324218\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epoch 0: 0%| | 0/30 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypenamementioned_instring
00moduleExampleModule<NA><NA>
11classExampleModule.ExampleClass<NA><NA>
22class_methodExampleModule.ExampleClass.__init__<NA><NA>
33non_indexed_symbolbuiltins<NA><NA>
44classbuiltins.int<NA><NA>
..................
145152mentionprint@FunctionDef_0x16d41315c9c41f53143<NA>
146153CallCall_0x16d41315c9148a75143print(a+b)
147154BinOpBinOp_0x16d41315c919b5a9143a+b
148155mentionmain@Module_0x16d41315c9361936138<NA>
149156CallCall_0x16d41315c99405f0138main()
\n", + "

150 rows × 5 columns

\n", + "" + ], + "text/plain": [ + " id type name \\\n", + "0 0 module ExampleModule \n", + "1 1 class ExampleModule.ExampleClass \n", + "2 2 class_method ExampleModule.ExampleClass.__init__ \n", + "3 3 non_indexed_symbol builtins \n", + "4 4 class builtins.int \n", + ".. ... ... ... \n", + "145 152 mention print@FunctionDef_0x16d41315c9c41f53 \n", + "146 153 Call Call_0x16d41315c9148a75 \n", + "147 154 BinOp BinOp_0x16d41315c919b5a9 \n", + "148 155 mention main@Module_0x16d41315c9361936 \n", + "149 156 Call Call_0x16d41315c99405f0 \n", + "\n", + " mentioned_in string \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + ".. ... ... \n", + "145 143 \n", + "146 143 print(a+b) \n", + "147 143 a+b \n", + "148 138 \n", + "149 138 main() \n", + "\n", + "[150 rows x 5 columns]" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "40cbd760-f8c0-42a7-94cc-0a6e12aa2735", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesrcdstfile_idmentioned_in
00defines01NaN<NA>
11defines12NaN<NA>
22defines34NaN<NA>
33uses_type24NaN<NA>
44defines15NaN<NA>
.....................
410464func_rev15615532.0138
411465next14315632.0138
412466prev15614332.0138
413467defined_in_module15613832.0138
414468defined_in_module_rev13815632.0138
\n", + "

415 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id type src dst file_id mentioned_in\n", + "0 0 defines 0 1 NaN \n", + "1 1 defines 1 2 NaN \n", + "2 2 defines 3 4 NaN \n", + "3 3 uses_type 2 4 NaN \n", + "4 4 defines 1 5 NaN \n", + ".. ... ... ... ... ... ...\n", + "410 464 func_rev 156 155 32.0 138\n", + "411 465 next 143 156 32.0 138\n", + "412 466 prev 156 143 32.0 138\n", + "413 467 defined_in_module 156 138 32.0 138\n", + "414 468 defined_in_module_rev 138 156 32.0 138\n", + "\n", + "[415 rows x 6 columns]" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edges" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "99a5bcf2-1835-4ef2-b6c4-190a76f477fc", + "metadata": {}, + "outputs": [], + "source": [ + "assert all(edges.eval(\"src in @node_ids\", local_dict={\"node_ids\": nodes[\"id\"]}))\n", + "assert all(edges.eval(\"dst in @node_ids\", local_dict={\"node_ids\": nodes[\"id\"]}))" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "bbd3686e-0227-4673-8f9f-2c4e25cb2d2e", + "metadata": {}, + "outputs": [], + "source": [ + "nodes = nodes[[\"id\", \"type\", \"name\"]]\n", + "edges = edges[[\"id\", \"type\", \"src\", \"dst\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "e33b5977-7e85-4d79-bb9f-a13d930a8275", + "metadata": {}, + "source": [ + "## Reading type annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "1bdb3167-ce60-4ac7-8c70-79ec664c1e85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
srcdst
022int
235str
449str
653int
856str
1079None
12104int
14128str
\n", + "
" + ], + "text/plain": [ + " src dst\n", + "0 22 int\n", + "2 35 str\n", + "4 49 str\n", + "6 53 int\n", + "8 56 str\n", + "10 79 None\n", + "12 104 int\n", + "14 128 str" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_annotations = unpersist(\"small_graph/type_annotations.json\").query(\"src in @node_ids\", local_dict={\"node_ids\": nodes[\"id\"]})\n", + "type_annotations" + ] + }, + { + "cell_type": "markdown", + "id": "3bbe4302-77fc-4c52-939b-f658a08b7c80", + "metadata": {}, + "source": [ + "# Preprocessing graph\n", + "## Removing some edges" + ] + }, + { + "cell_type": "markdown", + "id": "72fdc4f8-f0d8-44d8-a680-99c777aa8a35", + "metadata": {}, + "source": [ + "As an exercise, we remove some edge types" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "d90f13af-c576-4da4-a40a-5f17b22182d0", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_global_edges(edges):\n", + " global_edges = SourceGraphDataset.get_global_edges()\n", + " is_ast = lambda type: type not in global_edges\n", + " edges = edges.query(\"type.map(@is_ast)\", local_dict={\"is_ast\": is_ast})\n", + " return edges" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "d4d5fd73-1591-427b-89d4-5e6abadfffcf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesrcdst
5050subword1415
5151arg1519
5252arg_rev1915
5353args1920
5454args_rev2019
...............
410464func_rev156155
411465next143156
412466prev156143
413467defined_in_module156138
414468defined_in_module_rev138156
\n", + "

327 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id type src dst\n", + "50 50 subword 14 15\n", + "51 51 arg 15 19\n", + "52 52 arg_rev 19 15\n", + "53 53 args 19 20\n", + "54 54 args_rev 20 19\n", + ".. ... ... ... ...\n", + "410 464 func_rev 156 155\n", + "411 465 next 143 156\n", + "412 466 prev 156 143\n", + "413 467 defined_in_module 156 138\n", + "414 468 defined_in_module_rev 138 156\n", + "\n", + "[327 rows x 4 columns]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edges_ast = remove_global_edges(edges)\n", + "edges_ast" + ] + }, + { + "cell_type": "markdown", + "id": "0fd6e0c9-63f9-4e81-865d-22557bb97800", + "metadata": {}, + "source": [ + "## Making sure no isolated nodes are present" + ] + }, + { + "cell_type": "markdown", + "id": "35a9d53e-85bc-435a-ac13-9087c7401008", + "metadata": {}, + "source": [ + "After graph has been edited, need to make sure there are no isolated nodes. They will cause errors when training GNN." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "cc1e0844-b5f4-4a0c-89d2-ca5f8992f8d4", + "metadata": {}, + "outputs": [], + "source": [ + "def ensure_connectedness(nodes, edges):\n", + " \"\"\"\n", + " Filter isolated nodes\n", + " :param nodes: DataFrame\n", + " :param edges: DataFrame\n", + " :return:\n", + " \"\"\"\n", + " unique_connected_nodes = set(edges['src'].append(edges['dst']))\n", + " \n", + " nodes = nodes.query(\"id in @unique_connected_nodes\", local_dict={\"unique_connected_nodes\": unique_connected_nodes})\n", + " \n", + " print(f\"Ending up with {len(nodes)} nodes and {len(edges)} edges\")\n", + " return nodes, edges" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "2758500f-3b46-4192-a364-5f1e65da9275", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ending up with 150 nodes and 415 edges\n" + ] + } + ], + "source": [ + "nodes, edges = ensure_connectedness(nodes, 
edges)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "1fcf3230-8028-4a8a-8ce4-df30c7a3b563", + "metadata": {}, + "outputs": [], + "source": [ + "assert all(edges.eval(\"src in @node_ids\", local_dict={\"node_ids\": nodes[\"id\"]}))\n", + "assert all(edges.eval(\"dst in @node_ids\", local_dict={\"node_ids\": nodes[\"id\"]}))" + ] + }, + { + "cell_type": "markdown", + "id": "5942c29a-7076-4b0c-9590-0551b6b7a74e", + "metadata": {}, + "source": [ + "## Adding extra node information" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "551ff497-6660-4ad9-be08-33d059bdd86b", + "metadata": {}, + "outputs": [], + "source": [ + "def format_node_types(nodes):\n", + " \"\"\"\n", + " DGL confuses some node types with internal objects, need to change current type names\n", + " \"\"\"\n", + " nodes = nodes.copy() # copying is slow for large datasets, prefer in-place operations\n", + " nodes['type_backup'] = nodes['type']\n", + " nodes['type'] = nodes['type'].apply(lambda x: f\"{x}_\")\n", + " # nodes['type'] = \"node_\"\n", + " # nodes = nodes.astype({'type': 'category'})\n", + " return nodes\n", + "\n", + "# def add_embedding_names(nodes):\n", + "# \"\"\"\n", + "# Embedding names are used for initial embeddings (layer 0)\n", + "# \"\"\"\n", + "# nodes = nodes.copy()\n", + "# nodes[\"embeddable\"] = True\n", + "# nodes[\"embeddable_name\"] = nodes[\"name\"].apply(SourceGraphDataset.get_embeddable_name)\n", + "# return nodes\n", + "\n", + "def add_splits(nodes, train_frac, restricted_id_pool=None):\n", + " nodes = nodes.copy()\n", + " \n", + " def random_partition():\n", + " r = random()\n", + " if r < train_frac:\n", + " return \"train\"\n", + " elif r < train_frac + (1 - train_frac) / 2:\n", + " return \"val\"\n", + " else:\n", + " return \"test\"\n", + " \n", + " import numpy as np\n", + " # define partitioning\n", + " masks = np.array([random_partition() for _ in range(len(nodes))])\n", + " \n", + " # create masks\n", + " 
nodes[\"train_mask\"] = masks == \"train\"\n", + " nodes[\"val_mask\"] = masks == \"val\"\n", + " nodes[\"test_mask\"] = masks == \"test\"\n", + " \n", + " if restricted_id_pool is not None:\n", + " # if `restricted_id_pool` is provided, mask all nodes not in `restricted_id_pool` negatively\n", + " to_keep = nodes.eval(\"id in @restricted_ids\", local_dict={\"restricted_ids\": restricted_id_pool})\n", + " nodes[\"train_mask\"] = nodes[\"train_mask\"] & to_keep\n", + " nodes[\"test_mask\"] = nodes[\"test_mask\"] & to_keep\n", + " nodes[\"val_mask\"] = nodes[\"val_mask\"] & to_keep\n", + " \n", + " return nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "63b0ec32-69aa-4793-b73c-8fb766adecc5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train examples: 4\n", + "Test examples: 1\n", + "Validation examples: 3\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypenametype_backuptrain_maskval_masktest_mask
00module_ExampleModulemoduleFalseFalseFalse
11class_ExampleModule.ExampleClassclassFalseFalseFalse
22class_method_ExampleModule.ExampleClass.__init__class_methodFalseFalseFalse
33non_indexed_symbol_builtinsnon_indexed_symbolFalseFalseFalse
44class_builtins.intclassFalseFalseFalse
........................
145152mention_print@FunctionDef_0x16d41315c9c41f53mentionFalseFalseFalse
146153Call_Call_0x16d41315c9148a75CallFalseFalseFalse
147154BinOp_BinOp_0x16d41315c919b5a9BinOpFalseFalseFalse
148155mention_main@Module_0x16d41315c9361936mentionFalseFalseFalse
149156Call_Call_0x16d41315c99405f0CallFalseFalseFalse
\n", + "

150 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id type name \\\n", + "0 0 module_ ExampleModule \n", + "1 1 class_ ExampleModule.ExampleClass \n", + "2 2 class_method_ ExampleModule.ExampleClass.__init__ \n", + "3 3 non_indexed_symbol_ builtins \n", + "4 4 class_ builtins.int \n", + ".. ... ... ... \n", + "145 152 mention_ print@FunctionDef_0x16d41315c9c41f53 \n", + "146 153 Call_ Call_0x16d41315c9148a75 \n", + "147 154 BinOp_ BinOp_0x16d41315c919b5a9 \n", + "148 155 mention_ main@Module_0x16d41315c9361936 \n", + "149 156 Call_ Call_0x16d41315c99405f0 \n", + "\n", + " type_backup train_mask val_mask test_mask \n", + "0 module False False False \n", + "1 class False False False \n", + "2 class_method False False False \n", + "3 non_indexed_symbol False False False \n", + "4 class False False False \n", + ".. ... ... ... ... \n", + "145 mention False False False \n", + "146 Call False False False \n", + "147 BinOp False False False \n", + "148 mention False False False \n", + "149 Call False False False \n", + "\n", + "[150 rows x 7 columns]" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes = add_splits(format_node_types(nodes), 0.5, restricted_id_pool=type_annotations[\"src\"])\n", + "print(\"Train examples:\", len(nodes.query(\"train_mask == True\")))\n", + "print(\"Test examples:\", len(nodes.query(\"test_mask == True\")))\n", + "print(\"Validation examples:\", len(nodes.query(\"val_mask == True\")))\n", + "nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "fe87c0bd-4294-4978-bbba-a3f24bf73024", + "metadata": {}, + "outputs": [], + "source": [ + "def add_type_dependent_dense_ids_to_nodes(nodes):\n", + " \"\"\"\n", + " DGL requires dense ids: https://docs.dgl.ai/en/latest/generated/dgl.heterograph.html#dgl.heterograph\n", + " Compute dense ids for each node type\n", + " \"\"\"\n", + " nodes = nodes.copy()\n", + "\n", + " typed_id_map = {}\n", + "\n", + " for type_ in 
nodes['type'].unique():\n", + " # create mask for the current node type\n", + " type_mask = nodes['type'] == type_\n", + "\n", + " # `compact_property` will create a dense mapping\n", + " # it is equivalent to dict(zip(node_ids, range(len(node_ids))))\n", + " id_map = compact_property(nodes.loc[type_mask, 'id'])\n", + "\n", + " # add a new column with dense type-dependent ids\n", + " nodes.loc[type_mask, 'typed_id'] = nodes.loc[type_mask, 'id'].apply(lambda old_id: id_map[old_id])\n", + "\n", + " # store for further reference\n", + " typed_id_map[type_] = id_map\n", + "\n", + " nodes = nodes.astype({\"typed_id\": \"int64\"})\n", + " return nodes, typed_id_map" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "40b80587-d44c-4093-b074-66a434354264", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypenametype_backuptrain_maskval_masktest_masktyped_id
00module_ExampleModulemoduleFalseFalseFalse0
11class_ExampleModule.ExampleClassclassFalseFalseFalse0
22class_method_ExampleModule.ExampleClass.__init__class_methodFalseFalseFalse0
33non_indexed_symbol_builtinsnon_indexed_symbolFalseFalseFalse0
44class_builtins.intclassFalseFalseFalse1
...........................
145152mention_print@FunctionDef_0x16d41315c9c41f53mentionFalseFalseFalse33
146153Call_Call_0x16d41315c9148a75CallFalseFalseFalse9
147154BinOp_BinOp_0x16d41315c919b5a9BinOpFalseFalseFalse1
148155mention_main@Module_0x16d41315c9361936mentionFalseFalseFalse34
149156Call_Call_0x16d41315c99405f0CallFalseFalseFalse10
\n", + "

150 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " id type name \\\n", + "0 0 module_ ExampleModule \n", + "1 1 class_ ExampleModule.ExampleClass \n", + "2 2 class_method_ ExampleModule.ExampleClass.__init__ \n", + "3 3 non_indexed_symbol_ builtins \n", + "4 4 class_ builtins.int \n", + ".. ... ... ... \n", + "145 152 mention_ print@FunctionDef_0x16d41315c9c41f53 \n", + "146 153 Call_ Call_0x16d41315c9148a75 \n", + "147 154 BinOp_ BinOp_0x16d41315c919b5a9 \n", + "148 155 mention_ main@Module_0x16d41315c9361936 \n", + "149 156 Call_ Call_0x16d41315c99405f0 \n", + "\n", + " type_backup train_mask val_mask test_mask typed_id \n", + "0 module False False False 0 \n", + "1 class False False False 0 \n", + "2 class_method False False False 0 \n", + "3 non_indexed_symbol False False False 0 \n", + "4 class False False False 1 \n", + ".. ... ... ... ... ... \n", + "145 mention False False False 33 \n", + "146 Call False False False 9 \n", + "147 BinOp False False False 1 \n", + "148 mention False False False 34 \n", + "149 Call False False False 10 \n", + "\n", + "[150 rows x 8 columns]" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes, typed_id_map = add_type_dependent_dense_ids_to_nodes(nodes)\n", + "nodes" + ] + }, + { + "cell_type": "markdown", + "id": "1cd0fa5c-d17e-4273-a7a4-b7a1fbea93da", + "metadata": {}, + "source": [ + "## Adding extra edge information" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "be0d6b46-2b58-471a-880c-baf1f4f2a38a", + "metadata": {}, + "outputs": [], + "source": [ + "def format_edge_types(edges):\n", + " \"\"\"\n", + " DGL confuses some edge types with internal objects, need to change current type names\n", + " \"\"\"\n", + " edges = edges.copy()\n", + " edges['type'] = edges['type'].apply(lambda x: f\"{x}_\")\n", + " return edges" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "70626daf-c6f4-46cb-a405-828e93eeea6e", + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesrcdst
00defines_01
11defines_12
22defines_34
33uses_type_24
44defines_15
...............
410464func_rev_156155
411465next_143156
412466prev_156143
413467defined_in_module_156138
414468defined_in_module_rev_138156
\n", + "

415 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id type src dst\n", + "0 0 defines_ 0 1\n", + "1 1 defines_ 1 2\n", + "2 2 defines_ 3 4\n", + "3 3 uses_type_ 2 4\n", + "4 4 defines_ 1 5\n", + ".. ... ... ... ...\n", + "410 464 func_rev_ 156 155\n", + "411 465 next_ 143 156\n", + "412 466 prev_ 156 143\n", + "413 467 defined_in_module_ 156 138\n", + "414 468 defined_in_module_rev_ 138 156\n", + "\n", + "[415 rows x 4 columns]" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edges = format_edge_types(edges)\n", + "edges" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "ca377c26-bd4c-403b-a628-d33b2b3aad06", + "metadata": {}, + "outputs": [], + "source": [ + "def add_node_types_to_edges(nodes, edges):\n", + " \"\"\"\n", + " Add node types because they are needed for refining edge signatures\n", + " \"\"\"\n", + " edges = edges.copy()\n", + " node_type_map = dict(zip(nodes['id'], nodes['type']))\n", + "\n", + " edges['src_type'] = edges['src'].apply(lambda src_id: node_type_map[src_id])\n", + " edges['dst_type'] = edges['dst'].apply(lambda dst_id: node_type_map[dst_id])\n", + " edges = edges.astype({'src_type': 'category', 'dst_type': 'category'})\n", + "\n", + " return edges" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "5d330c29-65eb-4b18-9003-d89503e5747e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesrcdstsrc_typedst_type
00defines_01module_class_
11defines_12class_class_method_
22defines_34non_indexed_symbol_class_
33uses_type_24class_method_class_
44defines_15class_class_field_
.....................
410464func_rev_156155Call_mention_
411465next_143156FunctionDef_Call_
412466prev_156143Call_FunctionDef_
413467defined_in_module_156138Call_Module_
414468defined_in_module_rev_138156Module_Call_
\n", + "

415 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id type src dst src_type dst_type\n", + "0 0 defines_ 0 1 module_ class_\n", + "1 1 defines_ 1 2 class_ class_method_\n", + "2 2 defines_ 3 4 non_indexed_symbol_ class_\n", + "3 3 uses_type_ 2 4 class_method_ class_\n", + "4 4 defines_ 1 5 class_ class_field_\n", + ".. ... ... ... ... ... ...\n", + "410 464 func_rev_ 156 155 Call_ mention_\n", + "411 465 next_ 143 156 FunctionDef_ Call_\n", + "412 466 prev_ 156 143 Call_ FunctionDef_\n", + "413 467 defined_in_module_ 156 138 Call_ Module_\n", + "414 468 defined_in_module_rev_ 138 156 Module_ Call_\n", + "\n", + "[415 rows x 6 columns]" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edges = add_node_types_to_edges(nodes, edges)\n", + "edges" + ] + }, + { + "cell_type": "markdown", + "id": "05ddca13-ee7f-4525-9813-0938abc35536", + "metadata": {}, + "source": [ + "# Building graph" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "6ccab980-1be4-44c7-a220-3f849c9ed2c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique nodes: 150, node types: 26\n", + "Unique edges: 415, edge types: 50\n" + ] + } + ], + "source": [ + "print(f\"Unique nodes: {len(nodes)}, node types: {len(nodes['type'].unique())}\")\n", + "print(f\"Unique edges: {len(edges)}, edge types: {len(edges['type'].unique())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "94b006cb-9821-4e13-b03b-26dae995f9c3", + "metadata": {}, + "outputs": [], + "source": [ + "def add_global_dense_graph_id(nodes, graph, typed_id_map):\n", + " \"\"\"\n", + " Add dense global node ids to make it easier working with embeddings in the future\n", + " \"\"\"\n", + " orig_id = []\n", + " graph_id = []\n", + " prev_offset = 0\n", + " \n", + " nodes = nodes.copy()\n", + "\n", + " # simply assign global id in the order node types appear in `graph.ntypes`\n", + " for type_ in graph.ntypes:\n", + 
" from_id, to_id = zip(*typed_id_map[type_].items())\n", + " orig_id.extend(from_id)\n", + " graph_id.extend([t + prev_offset for t in to_id])\n", + " prev_offset += graph.number_of_nodes(type_)\n", + "\n", + " global_map = dict(zip(orig_id, graph_id))\n", + "\n", + " nodes['global_graph_id'] = nodes['id'].apply(lambda old_id: global_map[old_id])\n", + " \n", + " return nodes\n", + "\n", + "def add_node_data(graph, nodes):\n", + " field_types = {\n", + " \"train_mask\": torch.bool,\n", + " \"test_mask\": torch.bool,\n", + " \"val_mask\": torch.bool,\n", + " \"typed_id\": torch.int64,\n", + " \"original_id\": torch.int64,\n", + " \"global_graph_id\": torch.int64,\n", + " }\n", + " \n", + " for ntype in graph.ntypes:\n", + " node_data = nodes.query(f\"type == '{ntype}'\").sort_values('typed_id').rename({\"id\": \"original_id\"}, axis=1)\n", + " \n", + " for field_name, field_type in field_types.items():\n", + " graph.nodes[ntype].data[field_name] = torch.tensor(node_data[field_name].values, dtype=field_type)\n", + " \n", + " return graph\n", + "\n", + "def create_hetero_graph(nodes, edges, typed_id_map):\n", + " # nodes = nodes.copy()\n", + " # edges = edges.copy()\n", + "\n", + " # edges = add_node_types_to_edges(nodes, edges)\n", + "\n", + " typed_node_id = dict(zip(nodes['id'], nodes['typed_id']))\n", + "\n", + " typed_subgraphs = {}\n", + "\n", + " # group by in pandas is slow, use something else for large datasets\n", + " for signature, signature_edges in edges.groupby(['src_type', 'type', 'dst_type']): \n", + " # `signature` is a tuple (src_type, edge_type, dst_type)\n", + " typed_subgraphs[signature] = list(\n", + " zip(\n", + " signature_edges['src'].map(lambda old_id: typed_node_id[old_id]),\n", + " signature_edges['dst'].map(lambda old_id: typed_node_id[old_id])\n", + " )\n", + " )\n", + "\n", + " print(\n", + " f\"Unique triplet types in the graph: {len(typed_subgraphs.keys())}\"\n", + " )\n", + "\n", + " g = dgl.heterograph(typed_subgraphs)\n", + " \n", + 
" nodes = add_global_dense_graph_id(nodes, g, typed_id_map)\n", + " \n", + " g = add_node_data(g, nodes)\n", + " return g, nodes, edges" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "7d81d65a-2f1b-4525-b4e9-2cbdeb234980", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique triplet types in the graph: 140\n" + ] + } + ], + "source": [ + "g, nodes, edges = create_hetero_graph(nodes, edges, typed_id_map)" + ] + }, + { + "cell_type": "markdown", + "id": "f461773c-620c-45c9-9ce5-1d26f8f9c72d", + "metadata": {}, + "source": [ + "## Graph Atributes" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "25fc1ce2-309c-40c2-9bff-e6455800dc7f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['#attr#_',\n", + " 'AnnAssign_',\n", + " 'Assign_',\n", + " 'Attribute_',\n", + " 'BinOp_',\n", + " 'Call_',\n", + " 'ClassDef_',\n", + " 'Constant_',\n", + " 'FunctionDef_',\n", + " 'ImportFrom_',\n", + " 'JoinedStr_',\n", + " 'Module_',\n", + " 'Op_',\n", + " 'Return_',\n", + " 'alias_',\n", + " 'arg_',\n", + " 'arguments_',\n", + " 'class_',\n", + " 'class_field_',\n", + " 'class_method_',\n", + " 'function_',\n", + " 'global_variable_',\n", + " 'mention_',\n", + " 'module_',\n", + " 'non_indexed_symbol_',\n", + " 'subword_']" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g.ntypes" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "b5110724-8f41-4e93-b4a7-689aea7760a2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NodeSpace(data={'train_mask': tensor([False, False, False]), 'test_mask': tensor([False, False, False]), 'val_mask': tensor([False, False, False]), 'typed_id': tensor([0, 1, 2]), 'original_id': tensor([ 0, 10, 91]), 'global_graph_id': tensor([122, 123, 124])})" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + 
], + "source": [ + "g.nodes[\"module_\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "17680572-d69d-4f5c-9409-a662992c0307", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'#attr#_': tensor([False, False, False, False]),\n", + " 'AnnAssign_': tensor([False, False]),\n", + " 'Assign_': tensor([False, False, False, False, False]),\n", + " 'Attribute_': tensor([False, False, False, False, False, False, False]),\n", + " 'BinOp_': tensor([False, False]),\n", + " 'Call_': tensor([False, False, False, False, False, False, False, False, False, False,\n", + " False]),\n", + " 'ClassDef_': tensor([False, False]),\n", + " 'Constant_': tensor([False]),\n", + " 'FunctionDef_': tensor([False, True, False, True, False, False, False, False]),\n", + " 'ImportFrom_': tensor([False, False]),\n", + " 'JoinedStr_': tensor([False]),\n", + " 'Module_': tensor([False, False, False, False]),\n", + " 'Op_': tensor([False]),\n", + " 'Return_': tensor([False, False, False, False]),\n", + " 'alias_': tensor([False, False]),\n", + " 'arg_': tensor([False, False, False, False, False, False, False, False, False]),\n", + " 'arguments_': tensor([False, False, False, False, False, False]),\n", + " 'class_': tensor([False, False, False, False]),\n", + " 'class_field_': tensor([False, False]),\n", + " 'class_method_': tensor([False, False, False]),\n", + " 'function_': tensor([False, False, False, False, False, False]),\n", + " 'global_variable_': tensor([False]),\n", + " 'mention_': tensor([False, False, False, False, False, False, True, False, False, False,\n", + " False, False, False, False, False, False, False, False, False, True,\n", + " False, False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False]),\n", + " 'module_': tensor([False, False, False]),\n", + " 'non_indexed_symbol_': tensor([False]),\n", + " 'subword_': tensor([False, False, False, False, False, False, False, False, 
False, False,\n", + " False, False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False])}" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g.ndata[\"train_mask\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "bfbb97fa-2c68-4290-b74e-b63f0b5cd226", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('#attr#_', 'attr_', 'Attribute_'),\n", + " ('AnnAssign_', 'defined_in_function_', 'FunctionDef_'),\n", + " ('AnnAssign_', 'next_', 'AnnAssign_'),\n", + " ('AnnAssign_', 'next_', 'Return_'),\n", + " ('AnnAssign_', 'prev_', 'AnnAssign_'),\n", + " ('AnnAssign_', 'target_rev_', 'mention_'),\n", + " ('AnnAssign_', 'value_rev_', 'Attribute_'),\n", + " ('AnnAssign_', 'value_rev_', 'Call_'),\n", + " ('Assign_', 'defined_in_function_', 'FunctionDef_'),\n", + " ('Assign_', 'defined_in_module_', 'Module_'),\n", + " ('Assign_', 'next_', 'Assign_'),\n", + " ('Assign_', 'next_', 'Call_'),\n", + " ('Assign_', 'next_', 'FunctionDef_'),\n", + " ('Assign_', 'prev_', 'Assign_'),\n", + " ('Assign_', 'prev_', 'ImportFrom_'),\n", + " ('Assign_', 'targets_rev_', 'Attribute_'),\n", + " ('Assign_', 'targets_rev_', 'mention_'),\n", + " ('Assign_', 'value_rev_', 'Call_'),\n", + " ('Assign_', 'value_rev_', 'mention_'),\n", + " ('Attribute_', 'func_', 'Call_'),\n", + " ('Attribute_', 'left_', 'BinOp_'),\n", + " ('Attribute_', 'right_', 'BinOp_'),\n", + " ('Attribute_', 'targets_', 'Assign_'),\n", + " ('Attribute_', 'value_', 'AnnAssign_'),\n", + " ('Attribute_', 'value_rev_', 'mention_'),\n", + " ('BinOp_', 'args_', 'Call_'),\n", + " ('BinOp_', 'left_rev_', 'Attribute_'),\n", + " ('BinOp_', 'left_rev_', 'mention_'),\n", + " ('BinOp_', 'right_rev_', 'Attribute_'),\n", + " ('BinOp_', 'right_rev_', 'mention_'),\n", + " ('Call_', 'args_', 'Call_'),\n", + " ('Call_', 'args_rev_', 'BinOp_'),\n", + " ('Call_', 'args_rev_', 'Call_'),\n", + " 
('Call_', 'args_rev_', 'mention_'),\n", + " ('Call_', 'defined_in_function_', 'FunctionDef_'),\n", + " ('Call_', 'defined_in_module_', 'Module_'),\n", + " ('Call_', 'func_rev_', 'Attribute_'),\n", + " ('Call_', 'func_rev_', 'mention_'),\n", + " ('Call_', 'prev_', 'Assign_'),\n", + " ('Call_', 'prev_', 'FunctionDef_'),\n", + " ('Call_', 'value_', 'AnnAssign_'),\n", + " ('Call_', 'value_', 'Assign_'),\n", + " ('Call_', 'value_', 'Return_'),\n", + " ('ClassDef_', 'class_name_', 'mention_'),\n", + " ('ClassDef_', 'defined_in_class_rev_', 'FunctionDef_'),\n", + " ('ClassDef_', 'defined_in_module_', 'Module_'),\n", + " ('Constant_', 'args_', 'Call_'),\n", + " ('FunctionDef_', 'args_rev_', 'arguments_'),\n", + " ('FunctionDef_', 'defined_in_class_', 'ClassDef_'),\n", + " ('FunctionDef_', 'defined_in_function_rev_', 'AnnAssign_'),\n", + " ('FunctionDef_', 'defined_in_function_rev_', 'Assign_'),\n", + " ('FunctionDef_', 'defined_in_function_rev_', 'Call_'),\n", + " ('FunctionDef_', 'defined_in_function_rev_', 'Return_'),\n", + " ('FunctionDef_', 'defined_in_module_', 'Module_'),\n", + " ('FunctionDef_', 'function_name_', 'mention_'),\n", + " ('FunctionDef_', 'next_', 'Call_'),\n", + " ('FunctionDef_', 'next_', 'FunctionDef_'),\n", + " ('FunctionDef_', 'prev_', 'Assign_'),\n", + " ('FunctionDef_', 'prev_', 'FunctionDef_'),\n", + " ('FunctionDef_', 'prev_', 'ImportFrom_'),\n", + " ('ImportFrom_', 'defined_in_module_', 'Module_'),\n", + " ('ImportFrom_', 'module_rev_', 'mention_'),\n", + " ('ImportFrom_', 'names_rev_', 'alias_'),\n", + " ('ImportFrom_', 'next_', 'Assign_'),\n", + " ('ImportFrom_', 'next_', 'FunctionDef_'),\n", + " ('JoinedStr_', 'value_', 'Return_'),\n", + " ('Module_', 'defined_in_module_rev_', 'Assign_'),\n", + " ('Module_', 'defined_in_module_rev_', 'Call_'),\n", + " ('Module_', 'defined_in_module_rev_', 'ClassDef_'),\n", + " ('Module_', 'defined_in_module_rev_', 'FunctionDef_'),\n", + " ('Module_', 'defined_in_module_rev_', 'ImportFrom_'),\n", + " ('Op_', 
'op_', 'BinOp_'),\n", + " ('Return_', 'defined_in_function_', 'FunctionDef_'),\n", + " ('Return_', 'prev_', 'AnnAssign_'),\n", + " ('Return_', 'value_rev_', 'Call_'),\n", + " ('Return_', 'value_rev_', 'mention_'),\n", + " ('alias_', 'asname_rev_', 'mention_'),\n", + " ('alias_', 'name_rev_', 'mention_'),\n", + " ('alias_', 'names_', 'ImportFrom_'),\n", + " ('arg_', 'arg_rev_', 'mention_'),\n", + " ('arg_', 'args_', 'arguments_'),\n", + " ('arguments_', 'args_', 'FunctionDef_'),\n", + " ('arguments_', 'args_rev_', 'arg_'),\n", + " ('class_', 'defined_in_', 'module_'),\n", + " ('class_', 'defined_in_', 'non_indexed_symbol_'),\n", + " ('class_', 'defines_', 'class_field_'),\n", + " ('class_', 'defines_', 'class_method_'),\n", + " ('class_', 'defines_', 'function_'),\n", + " ('class_', 'global_mention_', 'mention_'),\n", + " ('class_', 'imported_by_', 'module_'),\n", + " ('class_', 'type_used_by_', 'class_method_'),\n", + " ('class_', 'type_used_by_', 'function_'),\n", + " ('class_field_', 'defined_in_', 'class_'),\n", + " ('class_field_', 'used_by_', 'class_method_'),\n", + " ('class_field_', 'used_by_', 'function_'),\n", + " ('class_method_', 'called_by_', 'function_'),\n", + " ('class_method_', 'called_by_', 'module_'),\n", + " ('class_method_', 'defined_in_', 'class_'),\n", + " ('class_method_', 'global_mention_', 'mention_'),\n", + " ('class_method_', 'uses_', 'class_field_'),\n", + " ('class_method_', 'uses_type_', 'class_'),\n", + " ('function_', 'called_by_', 'function_'),\n", + " ('function_', 'called_by_', 'module_'),\n", + " ('function_', 'calls_', 'class_method_'),\n", + " ('function_', 'calls_', 'function_'),\n", + " ('function_', 'defined_in_', 'class_'),\n", + " ('function_', 'defined_in_', 'non_indexed_symbol_'),\n", + " ('function_', 'global_mention_', 'mention_'),\n", + " ('function_', 'uses_', 'class_field_'),\n", + " ('function_', 'uses_', 'global_variable_'),\n", + " ('function_', 'uses_type_', 'class_'),\n", + " ('global_variable_', 'defined_in_', 
'module_'),\n", + " ('global_variable_', 'global_mention_', 'mention_'),\n", + " ('global_variable_', 'used_by_', 'function_'),\n", + " ('mention_', 'arg_', 'arg_'),\n", + " ('mention_', 'args_', 'Call_'),\n", + " ('mention_', 'asname_', 'alias_'),\n", + " ('mention_', 'class_name_rev_', 'ClassDef_'),\n", + " ('mention_', 'func_', 'Call_'),\n", + " ('mention_', 'function_name_rev_', 'FunctionDef_'),\n", + " ('mention_', 'left_', 'BinOp_'),\n", + " ('mention_', 'module_', 'ImportFrom_'),\n", + " ('mention_', 'name_', 'alias_'),\n", + " ('mention_', 'right_', 'BinOp_'),\n", + " ('mention_', 'target_', 'AnnAssign_'),\n", + " ('mention_', 'targets_', 'Assign_'),\n", + " ('mention_', 'value_', 'Assign_'),\n", + " ('mention_', 'value_', 'Attribute_'),\n", + " ('mention_', 'value_', 'Return_'),\n", + " ('module_', 'calls_', 'class_method_'),\n", + " ('module_', 'calls_', 'function_'),\n", + " ('module_', 'defines_', 'class_'),\n", + " ('module_', 'defines_', 'global_variable_'),\n", + " ('module_', 'global_mention_', 'mention_'),\n", + " ('module_', 'imports_', 'class_'),\n", + " ('module_', 'used_by_', 'module_'),\n", + " ('module_', 'uses_', 'module_'),\n", + " ('non_indexed_symbol_', 'defines_', 'class_'),\n", + " ('non_indexed_symbol_', 'defines_', 'function_'),\n", + " ('subword_', 'subword_', 'mention_')]" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g.canonical_etypes" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "b4b3e786-a021-4048-9b0c-020733171683", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EdgeSpace(data={})" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g.edges[('subword_', 'subword_', 'mention_')]" + ] + }, + { + "cell_type": "markdown", + "id": "1df19fd9-7675-41ba-bcf7-99121f15048f", + "metadata": {}, + "source": [ + "## Dataloaders" + ] + }, + { + "cell_type": "code", + 
"execution_count": 98, + "id": "1a41f3b6-2f04-4ac3-8c2c-109c26c6d578", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'FunctionDef_': tensor([1, 3]), 'mention_': tensor([ 6, 19])}" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_train_nodes(graph):\n", + " train_nodes = {}\n", + " for node_type, mask in g.ndata[\"train_mask\"].items():\n", + " train_ids = g.ndata[\"typed_id\"][node_type][mask]\n", + " if len(train_ids) > 0:\n", + " train_nodes[node_type] = train_ids\n", + " return train_nodes\n", + "\n", + "get_train_nodes(g)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "ef3e7b73-925c-43e8-a2ab-c25ba710251d", + "metadata": {}, + "outputs": [], + "source": [ + "sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)\n", + "loader = dgl.dataloading.NodeDataLoader(\n", + " g, get_train_nodes(g), sampler, batch_size=1, shuffle=False, num_workers=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "ba78130d-64e9-460d-87fb-122f413902f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batch: 0\n", + "Seeds:\n", + "FunctionDef_ tensor([35])\n", + "\n", + "Input nodes:\n", + "ClassDef_ tensor([17])\n", + "FunctionDef_ tensor([35, 16, 49])\n", + "Return_ tensor([45])\n", + "arguments_ tensor([37])\n", + "mention_ tensor([46])\n", + "\n", + "Layer 0\n", + "tensor([17]) --> ('ClassDef_', 'defined_in_class_rev_', 'FunctionDef_') --> tensor([35])\n", + "tensor([35, 16, 49]) --> ('FunctionDef_', 'next_', 'FunctionDef_') --> tensor([35])\n", + "tensor([35, 16, 49]) --> ('FunctionDef_', 'prev_', 'FunctionDef_') --> tensor([35])\n", + "tensor([45]) --> ('Return_', 'defined_in_function_', 'FunctionDef_') --> tensor([35])\n", + "tensor([37]) --> ('arguments_', 'args_', 'FunctionDef_') --> tensor([35])\n", + "tensor([46]) --> ('mention_', 'function_name_rev_', 'FunctionDef_') --> 
tensor([35])\n", + "\n", + "\n", + "Batch: 1\n", + "Seeds:\n", + "FunctionDef_ tensor([79])\n", + "\n", + "Input nodes:\n", + "Assign_ tensor([75])\n", + "Call_ tensor([83, 90])\n", + "FunctionDef_ tensor([79])\n", + "Module_ tensor([66])\n", + "mention_ tensor([89])\n", + "\n", + "Layer 0\n", + "tensor([75]) --> ('Assign_', 'next_', 'FunctionDef_') --> tensor([79])\n", + "tensor([83, 90]) --> ('Call_', 'defined_in_function_', 'FunctionDef_') --> tensor([79])\n", + "tensor([83, 90]) --> ('Call_', 'prev_', 'FunctionDef_') --> tensor([79])\n", + "tensor([66]) --> ('Module_', 'defined_in_module_rev_', 'FunctionDef_') --> tensor([79])\n", + "tensor([89]) --> ('mention_', 'function_name_rev_', 'FunctionDef_') --> tensor([79])\n", + "\n", + "\n", + "Batch: 2\n", + "Seeds:\n", + "mention_ tensor([53])\n", + "\n", + "Input nodes:\n", + "AnnAssign_ tensor([54])\n", + "Call_ tensor([59])\n", + "mention_ tensor([53])\n", + "subword_ tensor([52, 47])\n", + "\n", + "Layer 0\n", + "tensor([54]) --> ('AnnAssign_', 'target_rev_', 'mention_') --> tensor([53])\n", + "tensor([59]) --> ('Call_', 'args_rev_', 'mention_') --> tensor([53])\n", + "tensor([52, 47]) --> ('subword_', 'subword_', 'mention_') --> tensor([53])\n", + "\n", + "\n", + "Batch: 3\n", + "Seeds:\n", + "mention_ tensor([104])\n", + "\n", + "Input nodes:\n", + "Assign_ tensor([106])\n", + "arg_ tensor([105])\n", + "mention_ tensor([104])\n", + "subword_ tensor([103])\n", + "\n", + "Layer 0\n", + "tensor([106]) --> ('Assign_', 'value_rev_', 'mention_') --> tensor([104])\n", + "tensor([105]) --> ('arg_', 'arg_rev_', 'mention_') --> tensor([104])\n", + "tensor([103]) --> ('subword_', 'subword_', 'mention_') --> tensor([104])\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for ind, (input_nodes, seeds, blocks) in enumerate(loader):\n", + " print(\"Batch:\", ind)\n", + " \n", + " print(\"Seeds:\")\n", + " for key, val in seeds.items():\n", + " if len(val) > 0:\n", + " print(key, 
blocks[-1].dstnodes[key].data[\"original_id\"])\n", + " \n", + " print()\n", + " \n", + " print(\"Input nodes:\")\n", + " for key, val in input_nodes.items():\n", + " if len(val) > 0:\n", + " print(key, blocks[0].srcnodes[key].data[\"original_id\"])\n", + " \n", + " print()\n", + " \n", + " for b_ind, block in enumerate(blocks):\n", + " print(\"Layer\", b_ind)\n", + " for etype in block.canonical_etypes:\n", + " if block[etype].num_edges() > 0:\n", + " # for srctype, dsttype in zip(\n", + " # print(blocks[0][etype].adj().to_dense())\n", + " print(block.srcnodes[etype[0]].data[\"original_id\"], \"-->\", etype, \"-->\", block.dstnodes[etype[2]].data[\"original_id\"]) #, block[etype].num_edges()) \n", + " print()\n", + " print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f18ad6d7-8e92-48a5-9888-f9c2c9471414", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:SourceCodeTools]", + "language": "python", + "name": "conda-env-SourceCodeTools-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/SCAA AuthorMatching.ipynb b/examples/SCAA AuthorMatching.ipynb new file mode 100644 index 00000000..9e1a30dd --- /dev/null +++ b/examples/SCAA AuthorMatching.ipynb @@ -0,0 +1,1239 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "244ac956", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bdefc65e", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "ename": 
"ModuleNotFoundError", + "evalue": "No module named 'SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mSourceCodeTools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgraph\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtrain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mobjectives\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mNodeClassificationObjective\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NodeClassifierObjective\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mSourceCodeTools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgraph\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtrain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mobjectives\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mSubgraphClassifierObjective\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SubgraphAbstractObjective, \\\n\u001b[1;32m 8\u001b[0m SubgraphClassifierObjective, SubgraphEmbeddingObjective\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mSourceCodeTools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgraph\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtrain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mobjectives\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mSubgraphEmbedderObjective\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SubgraphEmbeddingObjective, \\\n\u001b[1;32m 10\u001b[0m SubgraphMatchingObjective\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mSourceCodeTools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgraph\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtrain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_name, get_model_base\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mSourceCodeTools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgraph\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RGGAN\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective'" + ] + } + ], + "source": [ + "from random import random\n", + "\n", + "from SourceCodeTools.models.training_config import get_config, save_config, load_config\n", + "from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq\n", + "from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer\n", + "from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective\n", + "from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import 
" SubgraphAbstractObjective, \\\n", + " SubgraphClassifierObjective, SubgraphEmbeddingObjective\n", + "from SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective import SubgraphEmbeddingObjective, \\\n", + " SubgraphMatchingObjective\n", + "from SourceCodeTools.models.graph.train.utils import get_name, get_model_base\n", + "from SourceCodeTools.models.graph import RGGAN\n", + "from SourceCodeTools.tabular.common import compact_property\n", + "from SourceCodeTools.code.data.file_utils import unpersist\n", + "\n", + "import dgl\n", + "import torch\n", + "import numpy as np\n", + "from argparse import Namespace\n", + "from torch import nn\n", + "from datetime import datetime\n", + "from os.path import join\n", + "from functools import partial" + ] + }, + { + "cell_type": "markdown", + "id": "4becf482", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "tags": [] + }, + "source": [ + "# Prepare parameters and options\n", + "\n", + "Full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are meant to be used as arguments for the CLI trainer. Trainer script can be found in `SourceCodeTools/scripts/train.py`.\n", + "\n", + "For the task of subgraph classification the important options are:\n", + "- `subgraph_partition` is the path to subgraph-based train/val/test sets. Stored as a DataFrame with subgraph id and partition mask\n", + "- `subgraph_id_column` is a column in the `common_edges` file that stores subgraph id.\n", + "- For variable misuse task (same will apply to authorship attribution) subgraphs are created for individual functions (files for SCAA). The label is stored in `common_filecontent`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5637f70e-b8fd-4a9f-9956-d60a61bd2870", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "tokenizer_path = \"sentencepiece_bpe.model\"\n", + "\n", + "data_path = \"scaa_9_tasks_python_graph\"\n", + "subgraph_partition = join(data_path, \"subgraph_partition.json.bz2\")\n", + "filecontent_path = join(data_path, \"common_filecontent.json.bz2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "404565f5-09a3-4535-ba51-8533491f7c72", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtrain_maskval_masktest_mask
0489TrueFalseFalse
1474FalseFalseTrue
2310TrueFalseFalse
380TrueFalseFalse
4374TrueFalseFalse
...............
50848TrueFalseFalse
509108TrueFalseFalse
510484TrueFalseFalse
511406TrueFalseFalse
512152TrueFalseFalse
\n", + "

513 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id train_mask val_mask test_mask\n", + "0 489 True False False\n", + "1 474 False False True\n", + "2 310 True False False\n", + "3 80 True False False\n", + "4 374 True False False\n", + ".. ... ... ... ...\n", + "508 48 True False False\n", + "509 108 True False False\n", + "510 484 True False False\n", + "511 406 True False False\n", + "512 152 True False False\n", + "\n", + "[513 rows x 4 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(subgraph_partition)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d09b740a-eaaf-4fac-a73f-0bd9bb671f70", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesource_node_idtarget_node_idfile_idmentioned_in
00subword6804849684489NaN
11name496843206248920349.0
22name_rev320624968448920349.0
33names320627539848920349.0
44names_rev753983206248920349.0
55subword4890819829489NaN
66name198297069948920349.0
77name_rev706991982948920349.0
88names706997539848920349.0
99names_rev753987069948920349.0
10106448args_rev321333733631937944.0
11106449executed_while_true321333471731937944.0
12106450executed_while_true_rev347173213331937944.0
13106451target477024948531937944.0
14106452target_rev494854770231937944.0
15106453op239174948531937944.0
16106454value499524948531937944.0
17106456next321334948531937944.0
18106457prev494853213331937944.0
19106458executed_while_true494853471731937944.0
20212803left559976382938752804.0
21212804left_rev638295599738752804.0
22212805right499526382938752804.0
23212807op239176382938752804.0
24212808args638292197638752804.0
25212809args_rev219766382938752804.0
26212810test428745612338752804.0
27212811test_rev561234287438752804.0
28212812value499522126038752804.0
29212814attr28302126038752804.0
\n", + "
" + ], + "text/plain": [ + " id type source_node_id target_node_id file_id \\\n", + "0 0 subword 68048 49684 489 \n", + "1 1 name 49684 32062 489 \n", + "2 2 name_rev 32062 49684 489 \n", + "3 3 names 32062 75398 489 \n", + "4 4 names_rev 75398 32062 489 \n", + "5 5 subword 48908 19829 489 \n", + "6 6 name 19829 70699 489 \n", + "7 7 name_rev 70699 19829 489 \n", + "8 8 names 70699 75398 489 \n", + "9 9 names_rev 75398 70699 489 \n", + "10 106448 args_rev 32133 37336 319 \n", + "11 106449 executed_while_true 32133 34717 319 \n", + "12 106450 executed_while_true_rev 34717 32133 319 \n", + "13 106451 target 47702 49485 319 \n", + "14 106452 target_rev 49485 47702 319 \n", + "15 106453 op 23917 49485 319 \n", + "16 106454 value 49952 49485 319 \n", + "17 106456 next 32133 49485 319 \n", + "18 106457 prev 49485 32133 319 \n", + "19 106458 executed_while_true 49485 34717 319 \n", + "20 212803 left 55997 63829 387 \n", + "21 212804 left_rev 63829 55997 387 \n", + "22 212805 right 49952 63829 387 \n", + "23 212807 op 23917 63829 387 \n", + "24 212808 args 63829 21976 387 \n", + "25 212809 args_rev 21976 63829 387 \n", + "26 212810 test 42874 56123 387 \n", + "27 212811 test_rev 56123 42874 387 \n", + "28 212812 value 49952 21260 387 \n", + "29 212814 attr 2830 21260 387 \n", + "\n", + " mentioned_in \n", + "0 NaN \n", + "1 20349.0 \n", + "2 20349.0 \n", + "3 20349.0 \n", + "4 20349.0 \n", + "5 NaN \n", + "6 20349.0 \n", + "7 20349.0 \n", + "8 20349.0 \n", + "9 20349.0 \n", + "10 37944.0 \n", + "11 37944.0 \n", + "12 37944.0 \n", + "13 37944.0 \n", + "14 37944.0 \n", + "15 37944.0 \n", + "16 37944.0 \n", + "17 37944.0 \n", + "18 37944.0 \n", + "19 37944.0 \n", + "20 52804.0 \n", + "21 52804.0 \n", + "22 52804.0 \n", + "23 52804.0 \n", + "24 52804.0 \n", + "25 52804.0 \n", + "26 52804.0 \n", + "27 52804.0 \n", + "28 52804.0 \n", + "29 52804.0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"unpersist(join(data_path, \"common_edges.json.bz2\"), nrows=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d38f83db", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "config = get_config(\n", + " subgraph_id_column=\"file_id\",\n", + " data_path=data_path,\n", + " model_output_dir=data_path,\n", + " subgraph_partition=subgraph_partition,\n", + " tokenizer_path=tokenizer_path,\n", + " objectives=\"subgraph_clf\",\n", + " measure_scores=True,\n", + " dilate_scores=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bf61e1a2-2a2b-4688-b6b6-4ad3c22b22a5", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# save_config(config, \"var_misuse_tiny.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "07dc7b86", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DATASET': {'data_path': 'scaa_9_tasks_python_graph',\n", + " 'train_frac': 0.9,\n", + " 'filter_edges': None,\n", + " 'min_count_for_objectives': 5,\n", + " 'self_loops': False,\n", + " 'use_node_types': False,\n", + " 'use_edge_types': False,\n", + " 'no_global_edges': False,\n", + " 'remove_reverse': False,\n", + " 'custom_reverse': None,\n", + " 'restricted_id_pool': None,\n", + " 'random_seed': None,\n", + " 'subgraph_id_column': 'file_id',\n", + " 'subgraph_partition': 'scaa_9_tasks_python_graph/subgraph_partition.json.bz2'},\n", + " 'TRAINING': {'model_output_dir': 'scaa_9_tasks_python_graph',\n", + " 'pretrained': None,\n", + " 'pretraining_phase': 0,\n", + " 'sampling_neighbourhood_size': 10,\n", + " 'neg_sampling_factor': 3,\n", + " 'use_layer_scheduling': False,\n", + " 'schedule_layers_every': 10,\n", + " 'elem_emb_size': 100,\n", + " 'embedding_table_size': 200000,\n", + " 'epochs': 100,\n", + " 'batch_size': 128,\n", + " 'learning_rate': 0.001,\n", + " 'objectives': 
'subgraph_clf',\n", + " 'save_each_epoch': False,\n", + " 'save_checkpoints': True,\n", + " 'early_stopping': False,\n", + " 'early_stopping_tolerance': 20,\n", + " 'force_w2v_ns': False,\n", + " 'use_ns_groups': False,\n", + " 'nn_index': 'brute',\n", + " 'metric': 'inner_prod',\n", + " 'measure_scores': False,\n", + " 'dilate_scores': 1,\n", + " 'gpu': -1,\n", + " 'external_dataset': None,\n", + " 'restore_state': False},\n", + " 'MODEL': {'node_emb_size': 100,\n", + " 'h_dim': 100,\n", + " 'n_layers': 5,\n", + " 'use_self_loop': True,\n", + " 'use_gcn_checkpoint': False,\n", + " 'use_att_checkpoint': False,\n", + " 'use_gru_checkpoint': False,\n", + " 'num_bases': 10,\n", + " 'dropout': 0.0,\n", + " 'activation': 'tanh'},\n", + " 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "markdown", + "id": "069a2528", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Create Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d0ce29fc", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "dataset = SourceGraphDataset(\n", + " **{**config[\"DATASET\"], **config[\"TOKENIZER\"]}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "897d41bf", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Declare target loading function (labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c279f43e", + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def load_labels():\n", + " filecontent = unpersist(filecontent_path)\n", + " return filecontent[[\"id\", \"user\"]].rename({\"id\": \"src\", \"user\": \"dst\"}, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "717a502c-4399-4e15-a83a-4f16c8e9639c", 
+ "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
srcdst
0489thekushalghosh
1474shubhi_
2310briangodwinlim
380Itachi_uchiha
4374erdnase
.........
50848DavidEdey
509108KimJohnWu
510484spelvin
511406luctchak
512152Pedastrian
\n", + "

513 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " src dst\n", + "0 489 thekushalghosh\n", + "1 474 shubhi_\n", + "2 310 briangodwinlim\n", + "3 80 Itachi_uchiha\n", + "4 374 erdnase\n", + ".. ... ...\n", + "508 48 DavidEdey\n", + "509 108 KimJohnWu\n", + "510 484 spelvin\n", + "511 406 luctchak\n", + "512 152 Pedastrian\n", + "\n", + "[513 rows x 2 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_labels()" + ] + }, + { + "cell_type": "markdown", + "id": "1ac27d90", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "One or several objectives could be used" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "67605a81", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "class Trainer(SamplingMultitaskTrainer):\n", + " def create_objectives(self, dataset, tokenizer_path):\n", + " self.objectives = nn.ModuleList()\n", + " \n", + " self.objectives.append(\n", + " self._create_subgraph_objective(\n", + " objective_name=\"SCAAMatching\",\n", + " objective_class=SubgraphMatchingObjective,\n", + " dataset=dataset,\n", + " tokenizer_path=tokenizer_path,\n", + " labels_fn=load_labels,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "393251c2", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%tensorboard --logdir data_path" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "369ed1bb", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of nodes 80375\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Precompute Target Embeddings: 0%| | 0/1 
[00:00\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtraining_procedure\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mRGGAN\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMODEL\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrainer_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTRAINING\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_base_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mget_model_base\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTRAINING\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mget_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mRGGAN\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdatetime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTrainer\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/sampling_multitask2.py:794\u001b[0m, in \u001b[0;36mtraining_procedure\u001b[0;34m(dataset, model_name, model_params, trainer_params, model_base_path, tokenizer_path, trainer, load_external_dataset)\u001b[0m\n\u001b[1;32m 781\u001b[0m trainer \u001b[38;5;241m=\u001b[39m trainer(\n\u001b[1;32m 782\u001b[0m dataset\u001b[38;5;241m=\u001b[39mdataset,\n\u001b[1;32m 783\u001b[0m model_name\u001b[38;5;241m=\u001b[39mmodel_name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 790\u001b[0m load_external_dataset\u001b[38;5;241m=\u001b[39mload_external_dataset\n\u001b[1;32m 791\u001b[0m )\n\u001b[1;32m 793\u001b[0m \u001b[38;5;66;03m# try:\u001b[39;00m\n\u001b[0;32m--> 794\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_all\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[38;5;66;03m# except KeyboardInterrupt:\u001b[39;00m\n\u001b[1;32m 796\u001b[0m \u001b[38;5;66;03m# logging.info(\"Training interrupted\")\u001b[39;00m\n\u001b[1;32m 797\u001b[0m \u001b[38;5;66;03m# except EarlyStopping:\u001b[39;00m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;66;03m# logging.info(\"Early stopping triggered\")\u001b[39;00m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;66;03m# except Exception as e:\u001b[39;00m\n\u001b[1;32m 800\u001b[0m \u001b[38;5;66;03m# print(\"There was an exception\", e)\u001b[39;00m\n\u001b[1;32m 802\u001b[0m trainer\u001b[38;5;241m.\u001b[39meval()\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/sampling_multitask2.py:567\u001b[0m, in \u001b[0;36mSamplingMultitaskTrainer.train_all\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mset_grad_enabled(\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m 565\u001b[0m objective\u001b[38;5;241m.\u001b[39mtarget_embedder\u001b[38;5;241m.\u001b[39mprepare_index() \u001b[38;5;66;03m# need this to 
update sampler for the next epoch\u001b[39;00m\n\u001b[0;32m--> 567\u001b[0m val_scores \u001b[38;5;241m=\u001b[39m \u001b[43mobjective\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mval\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 568\u001b[0m test_scores \u001b[38;5;241m=\u001b[39m objective\u001b[38;5;241m.\u001b[39mevaluate(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 570\u001b[0m summary \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/AbstractObjective.py:746\u001b[0m, in \u001b[0;36mAbstractObjective.evaluate\u001b[0;34m(self, data_split, neg_sampling_strategy, early_stopping, early_stopping_tolerance)\u001b[0m\n\u001b[1;32m 744\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mevaluate\u001b[39m(\u001b[38;5;28mself\u001b[39m, data_split, \u001b[38;5;241m*\u001b[39m, neg_sampling_strategy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, early_stopping\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, early_stopping_tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m20\u001b[39m):\n\u001b[1;32m 745\u001b[0m \u001b[38;5;66;03m# negative factor is 1 for evaluation\u001b[39;00m\n\u001b[0;32m--> 746\u001b[0m scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate_objective\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_split\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mneg_sampling_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnegative_factor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 747\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m data_split \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mval\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 748\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_early_stopping(scores[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/SubgraphClassifierObjective.py:238\u001b[0m, in \u001b[0;36mSubgraphAbstractObjective.evaluate_objective\u001b[0;34m(self, data_split, neg_sampling_strategy, negative_factor)\u001b[0m\n\u001b[1;32m 235\u001b[0m masked \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmasker\u001b[38;5;241m.\u001b[39mget_mask(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mseeds_to_python(seeds))\n\u001b[1;32m 237\u001b[0m src_embs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_graph_embeddings(input_nodes, blocks, masked\u001b[38;5;241m=\u001b[39mmasked, subgraph_masks\u001b[38;5;241m=\u001b[39msubgraph_masks)\n\u001b[0;32m--> 238\u001b[0m node_embs_, element_embs_, labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_for_prediction\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 239\u001b[0m \u001b[43m \u001b[49m\u001b[43msrc_embs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseeds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_embedding_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnegative_factor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnegative_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mneg_sampling_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mneg_sampling_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 241\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain_embeddings\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 242\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmeasure_scores:\n\u001b[1;32m 245\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m count \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdilate_scores \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/AbstractObjective.py:490\u001b[0m, in \u001b[0;36mAbstractObjective.prepare_for_prediction\u001b[0;34m(self, node_embeddings, seeds, target_embedding_fn, negative_factor, neg_sampling_strategy, train_embeddings)\u001b[0m\n\u001b[1;32m 485\u001b[0m positive_indices \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget_embedder[indices]\n\u001b[1;32m 486\u001b[0m negative_indices \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msample_negative(\n\u001b[1;32m 487\u001b[0m k\u001b[38;5;241m=\u001b[39mbatch_size \u001b[38;5;241m*\u001b[39m k, ids\u001b[38;5;241m=\u001b[39mindices, neg_sampling_strategy\u001b[38;5;241m=\u001b[39mneg_sampling_strategy\n\u001b[1;32m 488\u001b[0m )\n\u001b[0;32m--> 490\u001b[0m positive_dst, negative_dst \u001b[38;5;241m=\u001b[39m \u001b[43mtarget_embedding_fn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43mpositive_indices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnegative_indices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_embeddings\u001b[49m\n\u001b[1;32m 492\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[38;5;66;03m# TODO breaks cache in\u001b[39;00m\n\u001b[1;32m 495\u001b[0m \u001b[38;5;66;03m# SourceCodeTools.models.graph.train.objectives.GraphLinkClassificationObjective.TargetLinkMapper.get_labels\u001b[39;00m\n\u001b[1;32m 496\u001b[0m labels_pos \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcreate_positive_labels(indices)\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/SubgraphEmbedderObjective.py:101\u001b[0m, in \u001b[0;36mget_targets_from_subgraph\u001b[0;34m(self, positive_indices, negative_indices, train_embeddings)\u001b[0m\n\u001b[1;32m 99\u001b[0m \n\u001b[1;32m 100\u001b[0m positive_dst = prepare(positive_indices)\n\u001b[0;32m--> 101\u001b[0m negative_dst = prepare(negative_indices) if negative_indices is not None else None\n\u001b[1;32m 102\u001b[0m \n\u001b[1;32m 103\u001b[0m return positive_dst, negative_dst\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/SubgraphEmbedderObjective.py:95\u001b[0m, in \u001b[0;36mprepare\u001b[0;34m(indices)\u001b[0m\n\u001b[1;32m 93\u001b[0m def prepare(indices):\n\u001b[1;32m 94\u001b[0m indices_c = compactify(indices)\n\u001b[0;32m---> 95\u001b[0m embs = get_embeddings(self.train_loader.load_ids(indices_c))\n\u001b[1;32m 96\u001b[0m if len(indices_c) != len(indices):\n\u001b[1;32m 97\u001b[0m embs = restore(indices, indices_c, embs)\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/SubgraphEmbedderObjective.py:78\u001b[0m, in \u001b[0;36mget_embeddings\u001b[0;34m(loader)\u001b[0m\n\u001b[1;32m 76\u001b[0m ):\n\u001b[1;32m 77\u001b[0m def get_embeddings(loader):\n\u001b[0;32m---> 78\u001b[0m input_nodes, seeds, blocks = next(loader)\n\u001b[1;32m 79\u001b[0m subgraph_masks, seeds = seeds\n\u001b[1;32m 80\u001b[0m masked = 
self.masker.get_mask(self.seeds_to_python(seeds)) if self.masker is not None else None\n", + "File \u001b[0;32m~/dev/method-embeddings/SourceCodeTools/models/graph/train/objectives/SubgraphClassifierObjective.py:54\u001b[0m, in \u001b[0;36mSubgraphLoader.load_ids\u001b[0;34m(self, batch_ids)\u001b[0m\n\u001b[1;32m 52\u001b[0m subgraph_nodes \u001b[38;5;241m=\u001b[39m subgraph[type_]\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m node_id \u001b[38;5;129;01min\u001b[39;00m node_ids[type_]:\n\u001b[0;32m---> 54\u001b[0m \u001b[43mcoincidence_matrix\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mappend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnode_id\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubgraph_nodes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m coincidence_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mBoolTensor(coincidence_matrix)\n\u001b[1;32m 58\u001b[0m loader \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloading_fn(node_ids)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "training_procedure(\n", + " dataset, \n", + " model_name=RGGAN, \n", + " model_params=config[\"MODEL\"],\n", + " trainer_params=config[\"TRAINING\"],\n", + " model_base_path=get_model_base(config[\"TRAINING\"], get_name(RGGAN, str(datetime.now()))),\n", + " trainer=Trainer\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "879cf5e3", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b91dd03", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/Subgraph Classification.ipynb b/examples/Subgraph Classification.ipynb new file mode 100644 index 00000000..b65cbe14 --- /dev/null +++ b/examples/Subgraph Classification.ipynb @@ -0,0 +1,701 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "244ac956", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bdefc65e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using backend: pytorch\n" + ] + } + ], + "source": [ + "from random import random\n", + "\n", + "from SourceCodeTools.models.training_config import get_config, save_config, load_config\n", + "from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq\n", + "from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer\n", + "from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective\n", + "from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \\\n", + " SubgraphClassifierObjective, SubgraphEmbeddingObjective\n", + "from SourceCodeTools.models.graph.train.utils import get_name, get_model_base\n", + "from SourceCodeTools.models.graph import RGGAN\n", + "from SourceCodeTools.tabular.common import compact_property\n", + "from SourceCodeTools.code.data.file_utils import unpersist\n", + "\n", + "import dgl\n", + "import torch\n", + "import numpy 
as np\n", + "from argparse import Namespace\n", + "from torch import nn\n", + "from datetime import datetime\n", + "from os.path import join\n", + "from functools import partial" + ] + }, + { + "cell_type": "markdown", + "id": "4becf482", + "metadata": { + "tags": [] + }, + "source": [ + "# Prepare parameters and options\n", + "\n", + "Full list of options that can be added can be found in `SourceCodeTools/models/training_options.py`. They are ment to be used as arguments for cli trainer. Trainer script can be found in `SourceCodeTools/scripts/train.py`.\n", + "\n", + "For the task of subgraph classification the important options are:\n", + "- `subgraph_partition` is path to subgraph-based train/val/test sets. Storead as Dataframe with subgraph id and partition mask\n", + "- `subgraph_id_column` is a column is `common_edges` file that stores subgraph id.\n", + "- For variable misuse task (same will apply to authorship attribution) subgraphs are created for individual functions (files for SCAA). The label is stored in `common_filecontent`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5637f70e-b8fd-4a9f-9956-d60a61bd2870", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_path = \"sentencepiece_bpe.model\"\n", + "\n", + "data_path = \"cubert_varmisuse_tiny\"\n", + "subgraph_partition = join(data_path, \"partition.json.bz2\")\n", + "filecontent_path = join(data_path, \"common_filecontent.json.bz2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "404565f5-09a3-4535-ba51-8533491f7c72", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtrain_maskval_masktest_mask
05040TrueFalseFalse
19406TrueFalseFalse
29720TrueFalseFalse
35332TrueFalseFalse
46923TrueFalseFalse
...............
23087993090FalseFalseTrue
23088998594FalseFalseTrue
23089998064FalseFalseTrue
23090997465FalseFalseTrue
23091994477FalseFalseTrue
\n", + "

23092 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id train_mask val_mask test_mask\n", + "0 5040 True False False\n", + "1 9406 True False False\n", + "2 9720 True False False\n", + "3 5332 True False False\n", + "4 6923 True False False\n", + "... ... ... ... ...\n", + "23087 993090 False False True\n", + "23088 998594 False False True\n", + "23089 998064 False False True\n", + "23090 997465 False False True\n", + "23091 994477 False False True\n", + "\n", + "[23092 rows x 4 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(subgraph_partition)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d09b740a-eaaf-4fac-a73f-0bd9bb671f70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypesource_node_idtarget_node_idfile_idmentioned_in
00subword4831744250645040NaN
11arg425064971245040142323.0
22arg_rev971244250645040142323.0
33args971242487515040142323.0
44args_rev248751971245040142323.0
55args2487511423235040142323.0
66args_rev1423232487515040142323.0
77subword7861981432875040NaN
88decorator_list1432871423235040142323.0
99decorator_list_rev1423231432875040142323.0
\n", + "
" + ], + "text/plain": [ + " id type source_node_id target_node_id file_id \\\n", + "0 0 subword 483174 425064 5040 \n", + "1 1 arg 425064 97124 5040 \n", + "2 2 arg_rev 97124 425064 5040 \n", + "3 3 args 97124 248751 5040 \n", + "4 4 args_rev 248751 97124 5040 \n", + "5 5 args 248751 142323 5040 \n", + "6 6 args_rev 142323 248751 5040 \n", + "7 7 subword 786198 143287 5040 \n", + "8 8 decorator_list 143287 142323 5040 \n", + "9 9 decorator_list_rev 142323 143287 5040 \n", + "\n", + " mentioned_in \n", + "0 NaN \n", + "1 142323.0 \n", + "2 142323.0 \n", + "3 142323.0 \n", + "4 142323.0 \n", + "5 142323.0 \n", + "6 142323.0 \n", + "7 NaN \n", + "8 142323.0 \n", + "9 142323.0 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unpersist(join(data_path, \"common_edges.json.bz2\"), nrows=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d38f83db", + "metadata": {}, + "outputs": [], + "source": [ + "config = get_config(\n", + " subgraph_id_column=\"file_id\",\n", + " data_path=data_path,\n", + " model_output_dir=data_path,\n", + " subgraph_partition=subgraph_partition,\n", + " tokenizer_path=tokenizer_path,\n", + " objectives=\"subgraph_clf\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bf61e1a2-2a2b-4688-b6b6-4ad3c22b22a5", + "metadata": {}, + "outputs": [], + "source": [ + "save_config(config, \"var_misuse_tiny.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "07dc7b86", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DATASET': {'data_path': 'cubert_varmisuse_tiny',\n", + " 'train_frac': 0.9,\n", + " 'filter_edges': None,\n", + " 'min_count_for_objectives': 5,\n", + " 'self_loops': False,\n", + " 'use_node_types': False,\n", + " 'use_edge_types': False,\n", + " 'no_global_edges': False,\n", + " 'remove_reverse': False,\n", + " 'custom_reverse': None,\n", + " 'restricted_id_pool': None,\n", + " 
'random_seed': None,\n", + " 'subgraph_id_column': 'file_id',\n", + " 'subgraph_partition': 'cubert_varmisuse_tiny/partition.json.bz2'},\n", + " 'TRAINING': {'model_output_dir': 'cubert_varmisuse_tiny',\n", + " 'pretrained': None,\n", + " 'pretraining_phase': 0,\n", + " 'sampling_neighbourhood_size': 10,\n", + " 'neg_sampling_factor': 3,\n", + " 'use_layer_scheduling': False,\n", + " 'schedule_layers_every': 10,\n", + " 'elem_emb_size': 100,\n", + " 'embedding_table_size': 200000,\n", + " 'epochs': 100,\n", + " 'batch_size': 128,\n", + " 'learning_rate': 0.001,\n", + " 'objectives': 'subgraph_clf',\n", + " 'save_each_epoch': False,\n", + " 'save_checkpoints': True,\n", + " 'early_stopping': False,\n", + " 'early_stopping_tolerance': 20,\n", + " 'force_w2v_ns': False,\n", + " 'use_ns_groups': False,\n", + " 'nn_index': 'brute',\n", + " 'metric': 'inner_prod',\n", + " 'measure_scores': False,\n", + " 'dilate_scores': 200,\n", + " 'gpu': -1,\n", + " 'external_dataset': None,\n", + " 'restore_state': False},\n", + " 'MODEL': {'node_emb_size': 100,\n", + " 'h_dim': 100,\n", + " 'n_layers': 5,\n", + " 'use_self_loop': True,\n", + " 'use_gcn_checkpoint': False,\n", + " 'use_att_checkpoint': False,\n", + " 'use_gru_checkpoint': False,\n", + " 'num_bases': 10,\n", + " 'dropout': 0.0,\n", + " 'activation': 'tanh'},\n", + " 'TOKENIZER': {'tokenizer_path': 'sentencepiece_bpe.model'}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "markdown", + "id": "069a2528", + "metadata": {}, + "source": [ + "# Create Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d0ce29fc", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = SourceGraphDataset(\n", + " **{**config[\"DATASET\"], **config[\"TOKENIZER\"]}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "897d41bf", + "metadata": {}, + "source": [ + "# Declare target loading function (labels)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c279f43e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def load_labels():\n", + " filecontent = unpersist(filecontent_path)\n", + " return filecontent[[\"id\", \"label\"]].rename({\"id\": \"src\", \"label\": \"dst\"}, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "1ac27d90", + "metadata": {}, + "source": [ + "One or several objectives could be used" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "67605a81", + "metadata": {}, + "outputs": [], + "source": [ + "class Trainer(SamplingMultitaskTrainer):\n", + " def create_objectives(self, dataset, tokenizer_path):\n", + " self.objectives = nn.ModuleList()\n", + " \n", + " self.objectives.append(\n", + " self._create_subgraph_objective(\n", + " objective_name=\"VariableMisuseClf\",\n", + " objective_class=SubgraphClassifierObjective,\n", + " dataset=dataset,\n", + " tokenizer_path=tokenizer_path,\n", + " labels_fn=load_labels,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "393251c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%tensorboard --logdir data_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "369ed1bb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epoch 0: 0%| | 0/110 [00:00 str:\n \"\"\"\n Call another method. 
\u0412\u044b\u0437\u043e\u0432 \u0434\u0440\u0443\u0433\u043e\u0433\u043e \u043c\u0435\u0442\u043e\u0434\u0430.\n :return:\n \"\"\"\n return self.method2()\n\n def method2(self) -> str:\n \"\"\"\n Simple operations.\n \u041f\u0440\u043e\u0441\u0442\u044b\u0435 \u043e\u043f\u0435\u0440\u0430\u0446\u0438\u0438.\n :return:\n \"\"\"\n variable1: int = self.field\n variable2: str = str(variable1)\n return variable2"} +{"id":18,"type":"Module","serialized_name":"Module_0x16d41315b631c9ef","mentioned_in":null,"string":null} +{"id":19,"type":"arg","serialized_name":"arg_0x16d41315b616478a","mentioned_in":16,"string":"self"} +{"id":20,"type":"arguments","serialized_name":"arguments_0x16d41315b6320346","mentioned_in":16,"string":null} +{"id":21,"type":"subword","serialized_name":"\u2581argument","mentioned_in":null,"string":null} +{"id":22,"type":"mention","serialized_name":"argument@FunctionDef_0x16d41315b61e0b59","mentioned_in":16,"string":null} +{"id":23,"type":"arg","serialized_name":"arg_0x16d41315b6854c98","mentioned_in":16,"string":"argument: int"} +{"id":26,"type":"Assign","serialized_name":"Assign_0x16d41315b65e0868","mentioned_in":16,"string":"self.field = argument"} +{"id":27,"type":"Attribute","serialized_name":"Attribute_0x16d41315b6d16a6d","mentioned_in":16,"string":"self.field"} +{"id":28,"type":"#attr#","serialized_name":"field","mentioned_in":null,"string":null} +{"id":30,"type":"subword","serialized_name":"\u2581__","mentioned_in":null,"string":null} +{"id":31,"type":"mention","serialized_name":"__init__@ClassDef_0x16d41315b60713c7","mentioned_in":17,"string":null} +{"id":32,"type":"subword","serialized_name":"init","mentioned_in":null,"string":null} +{"id":33,"type":"subword","serialized_name":"__","mentioned_in":null,"string":null} +{"id":34,"type":"mention","serialized_name":"self@FunctionDef_0x16d41315b69b5874","mentioned_in":35,"string":null} 
+{"id":35,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315b69b5874","mentioned_in":17,"string":"def method1(self) -> str:\n \"\"\"\n Call another method. \u0412\u044b\u0437\u043e\u0432 \u0434\u0440\u0443\u0433\u043e\u0433\u043e \u043c\u0435\u0442\u043e\u0434\u0430.\n :return:\n \"\"\"\n return self.method2()"} +{"id":36,"type":"arg","serialized_name":"arg_0x16d41315b6881089","mentioned_in":35,"string":"self"} +{"id":37,"type":"arguments","serialized_name":"arguments_0x16d41315b64e4b84","mentioned_in":35,"string":null} +{"id":39,"type":"subword","serialized_name":"\u2581str","mentioned_in":null,"string":null} +{"id":40,"type":"Attribute","serialized_name":"Attribute_0x16d41315b6633cfb","mentioned_in":35,"string":"self.method2"} +{"id":41,"type":"#attr#","serialized_name":"method2","mentioned_in":null,"string":null} +{"id":42,"type":"subword","serialized_name":"\u2581method","mentioned_in":null,"string":null} +{"id":43,"type":"subword","serialized_name":"2","mentioned_in":null,"string":null} +{"id":44,"type":"Call","serialized_name":"Call_0x16d41315b6f8b441","mentioned_in":35,"string":"self.method2()"} +{"id":45,"type":"Return","serialized_name":"Return_0x16d41315b6887186","mentioned_in":35,"string":"return self.method2()"} +{"id":46,"type":"mention","serialized_name":"method1@ClassDef_0x16d41315b60713c7","mentioned_in":17,"string":null} +{"id":47,"type":"subword","serialized_name":"1","mentioned_in":null,"string":null} +{"id":48,"type":"mention","serialized_name":"self@FunctionDef_0x16d41315b6ad90da","mentioned_in":49,"string":null} +{"id":49,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315b6ad90da","mentioned_in":17,"string":"def method2(self) -> str:\n \"\"\"\n Simple operations.\n \u041f\u0440\u043e\u0441\u0442\u044b\u0435 \u043e\u043f\u0435\u0440\u0430\u0446\u0438\u0438.\n :return:\n \"\"\"\n variable1: int = self.field\n variable2: str = str(variable1)\n return variable2"} 
+{"id":50,"type":"arg","serialized_name":"arg_0x16d41315b69c8916","mentioned_in":49,"string":"self"} +{"id":51,"type":"arguments","serialized_name":"arguments_0x16d41315b685c455","mentioned_in":49,"string":null} +{"id":52,"type":"subword","serialized_name":"\u2581variable","mentioned_in":null,"string":null} +{"id":53,"type":"mention","serialized_name":"variable1@FunctionDef_0x16d41315b6ad90da","mentioned_in":49,"string":null} +{"id":54,"type":"AnnAssign","serialized_name":"AnnAssign_0x16d41315b69586f8","mentioned_in":49,"string":"variable1: int = self.field"} +{"id":55,"type":"Attribute","serialized_name":"Attribute_0x16d41315b692f3a5","mentioned_in":49,"string":"self.field"} +{"id":56,"type":"mention","serialized_name":"variable2@FunctionDef_0x16d41315b6ad90da","mentioned_in":49,"string":null} +{"id":57,"type":"AnnAssign","serialized_name":"AnnAssign_0x16d41315b64fbc69","mentioned_in":49,"string":"variable2: str = str(variable1)"} +{"id":58,"type":"mention","serialized_name":"str@FunctionDef_0x16d41315b6ad90da","mentioned_in":49,"string":null} +{"id":59,"type":"Call","serialized_name":"Call_0x16d41315b6720a4f","mentioned_in":49,"string":"str(variable1)"} +{"id":60,"type":"Return","serialized_name":"Return_0x16d41315b639dcda","mentioned_in":49,"string":"return variable2"} +{"id":61,"type":"mention","serialized_name":"method2@ClassDef_0x16d41315b60713c7","mentioned_in":17,"string":null} +{"id":62,"type":"subword","serialized_name":"\u2581Example","mentioned_in":null,"string":null} +{"id":63,"type":"mention","serialized_name":"ExampleClass@Module_0x16d41315b631c9ef","mentioned_in":18,"string":null} +{"id":64,"type":"subword","serialized_name":"Class","mentioned_in":null,"string":null} +{"id":65,"type":"mention","serialized_name":"ExampleModule@Module_0x16d41315b7d4afab","mentioned_in":66,"string":null} +{"id":66,"type":"Module","serialized_name":"Module_0x16d41315b7d4afab","mentioned_in":null,"string":null} 
+{"id":67,"type":"subword","serialized_name":"Module","mentioned_in":null,"string":null} +{"id":68,"type":"ImportFrom","serialized_name":"ImportFrom_0x16d41315b71ee551","mentioned_in":66,"string":"from ExampleModule import ExampleClass as EC"} +{"id":69,"type":"mention","serialized_name":"ExampleClass@Module_0x16d41315b7d4afab","mentioned_in":66,"string":null} +{"id":70,"type":"alias","serialized_name":"alias_0x16d41315b78c6d18","mentioned_in":66,"string":null} +{"id":71,"type":"subword","serialized_name":"\u2581EC","mentioned_in":null,"string":null} +{"id":72,"type":"mention","serialized_name":"EC@Module_0x16d41315b7d4afab","mentioned_in":66,"string":null} +{"id":73,"type":"Call","serialized_name":"Call_0x16d41315b75ea742","mentioned_in":66,"string":"EC(5)"} +{"id":74,"type":"Constant","serialized_name":"Constant_","mentioned_in":null,"string":null} +{"id":75,"type":"Assign","serialized_name":"Assign_0x16d41315b72b1e6c","mentioned_in":66,"string":"instance = EC(5)"} +{"id":76,"type":"subword","serialized_name":"\u2581instance","mentioned_in":null,"string":null} +{"id":77,"type":"mention","serialized_name":"instance@Module_0x16d41315b7d4afab","mentioned_in":66,"string":null} +{"id":79,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315b735b6ae","mentioned_in":66,"string":"def main() -> None:\n print(instance.method1())"} +{"id":81,"type":"subword","serialized_name":"\u2581print","mentioned_in":null,"string":null} +{"id":82,"type":"mention","serialized_name":"print@FunctionDef_0x16d41315b735b6ae","mentioned_in":79,"string":null} +{"id":83,"type":"Call","serialized_name":"Call_0x16d41315b795d9b1","mentioned_in":79,"string":"print(instance.method1())"} +{"id":84,"type":"mention","serialized_name":"instance@FunctionDef_0x16d41315b735b6ae","mentioned_in":79,"string":null} +{"id":85,"type":"Attribute","serialized_name":"Attribute_0x16d41315b7d4d09e","mentioned_in":79,"string":"instance.method1"} 
+{"id":86,"type":"#attr#","serialized_name":"method1","mentioned_in":null,"string":null} +{"id":87,"type":"Call","serialized_name":"Call_0x16d41315b7d93dbb","mentioned_in":79,"string":"instance.method1()"} +{"id":88,"type":"subword","serialized_name":"\u2581main","mentioned_in":null,"string":null} +{"id":89,"type":"mention","serialized_name":"main@Module_0x16d41315b7d4afab","mentioned_in":66,"string":null} +{"id":90,"type":"Call","serialized_name":"Call_0x16d41315b738fe7d","mentioned_in":66,"string":"main()"} +{"id":91,"type":"module","serialized_name":"Module","mentioned_in":null,"string":null} +{"id":92,"type":"class","serialized_name":"Module.Number","mentioned_in":null,"string":null} +{"id":93,"type":"class_method","serialized_name":"Module.Number.__init__","mentioned_in":null,"string":null} +{"id":94,"type":"class_field","serialized_name":"Module.Number.val","mentioned_in":null,"string":null} +{"id":95,"type":"function","serialized_name":"Module.Number.__add__","mentioned_in":null,"string":null} +{"id":96,"type":"function","serialized_name":"Module.Number.__repr__","mentioned_in":null,"string":null} +{"id":97,"type":"mention","serialized_name":"self@FunctionDef_0x16d41315c8deaa67","mentioned_in":98,"string":null} +{"id":98,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315c8deaa67","mentioned_in":99,"string":"def __init__(self, value: int):\n \"\"\"\n Initialize. \u0418\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f\n :param argument:\n \"\"\"\n self.val = value"} +{"id":99,"type":"ClassDef","serialized_name":"ClassDef_0x16d41315c87e23dc","mentioned_in":100,"string":"class Number:\n def __init__(self, value: int):\n \"\"\"\n Initialize. 
\u0418\u043d\u0438\u0446\u0438\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f\n :param argument:\n \"\"\"\n self.val = value\n\n def __add__(self, value):\n \"\"\"\n Add two numbers.\n \u0421\u043b\u043e\u0436\u0438\u0442\u044c 2 \u0447\u0438\u0441\u043b\u0430\n :param value:\n :return:\n \"\"\"\n return Number(self.val + value.val)\n\n def __repr__(self) -> str:\n \"\"\"\n Return representation\n :return: \u041f\u043e\u043b\u0443\u0447\u0438\u0442\u044c \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435\n \"\"\"\n return f\"Number({self.val})\""} +{"id":100,"type":"Module","serialized_name":"Module_0x16d41315c8795210","mentioned_in":null,"string":null} +{"id":101,"type":"arg","serialized_name":"arg_0x16d41315c8c0c85f","mentioned_in":98,"string":"self"} +{"id":102,"type":"arguments","serialized_name":"arguments_0x16d41315c86b314b","mentioned_in":98,"string":null} +{"id":103,"type":"subword","serialized_name":"\u2581value","mentioned_in":null,"string":null} +{"id":104,"type":"mention","serialized_name":"value@FunctionDef_0x16d41315c8deaa67","mentioned_in":98,"string":null} +{"id":105,"type":"arg","serialized_name":"arg_0x16d41315c829b52d","mentioned_in":98,"string":"value: int"} +{"id":106,"type":"Assign","serialized_name":"Assign_0x16d41315c8f71d5b","mentioned_in":98,"string":"self.val = value"} +{"id":107,"type":"Attribute","serialized_name":"Attribute_0x16d41315c898a89c","mentioned_in":98,"string":"self.val"} +{"id":108,"type":"#attr#","serialized_name":"val","mentioned_in":null,"string":null} +{"id":110,"type":"mention","serialized_name":"__init__@ClassDef_0x16d41315c87e23dc","mentioned_in":99,"string":null} +{"id":111,"type":"mention","serialized_name":"self@FunctionDef_0x16d41315c8336015","mentioned_in":112,"string":null} +{"id":112,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315c8336015","mentioned_in":99,"string":"def __add__(self, value):\n \"\"\"\n Add two numbers.\n \u0421\u043b\u043e\u0436\u0438\u0442\u044c 2 
\u0447\u0438\u0441\u043b\u0430\n :param value:\n :return:\n \"\"\"\n return Number(self.val + value.val)"} +{"id":113,"type":"arg","serialized_name":"arg_0x16d41315c836c8e3","mentioned_in":112,"string":"self"} +{"id":114,"type":"arguments","serialized_name":"arguments_0x16d41315c89f1854","mentioned_in":112,"string":null} +{"id":115,"type":"mention","serialized_name":"value@FunctionDef_0x16d41315c8336015","mentioned_in":112,"string":null} +{"id":116,"type":"arg","serialized_name":"arg_0x16d41315c8b89fd0","mentioned_in":112,"string":"value"} +{"id":117,"type":"subword","serialized_name":"\u2581Number","mentioned_in":null,"string":null} +{"id":118,"type":"mention","serialized_name":"Number@FunctionDef_0x16d41315c8336015","mentioned_in":112,"string":null} +{"id":119,"type":"Call","serialized_name":"Call_0x16d41315c81ad247","mentioned_in":112,"string":"Number(self.val + value.val)"} +{"id":120,"type":"Attribute","serialized_name":"Attribute_0x16d41315c8e05daa","mentioned_in":112,"string":"self.val"} +{"id":121,"type":"BinOp","serialized_name":"BinOp_0x16d41315c81b0e8d","mentioned_in":112,"string":"self.val + value.val"} +{"id":122,"type":"Attribute","serialized_name":"Attribute_0x16d41315c83902eb","mentioned_in":112,"string":"value.val"} +{"id":123,"type":"Op","serialized_name":"Add","mentioned_in":null,"string":null} +{"id":124,"type":"Return","serialized_name":"Return_0x16d41315c814af65","mentioned_in":112,"string":"return Number(self.val + value.val)"} +{"id":125,"type":"mention","serialized_name":"__add__@ClassDef_0x16d41315c87e23dc","mentioned_in":99,"string":null} +{"id":126,"type":"subword","serialized_name":"add","mentioned_in":null,"string":null} +{"id":127,"type":"mention","serialized_name":"self@FunctionDef_0x16d41315c81c9bdf","mentioned_in":128,"string":null} +{"id":128,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315c81c9bdf","mentioned_in":99,"string":"def __repr__(self) -> str:\n \"\"\"\n Return representation\n :return: 
\u041f\u043e\u043b\u0443\u0447\u0438\u0442\u044c \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u0438\u0435\n \"\"\"\n return f\"Number({self.val})\""} +{"id":129,"type":"arg","serialized_name":"arg_0x16d41315c8766105","mentioned_in":128,"string":"self"} +{"id":130,"type":"arguments","serialized_name":"arguments_0x16d41315c8044c7c","mentioned_in":128,"string":null} +{"id":131,"type":"JoinedStr","serialized_name":"JoinedStr_","mentioned_in":null,"string":null} +{"id":132,"type":"Return","serialized_name":"Return_0x16d41315c8a05fc6","mentioned_in":128,"string":"return f\"Number({self.val})\""} +{"id":133,"type":"mention","serialized_name":"__repr__@ClassDef_0x16d41315c87e23dc","mentioned_in":99,"string":null} +{"id":134,"type":"subword","serialized_name":"repr","mentioned_in":null,"string":null} +{"id":135,"type":"mention","serialized_name":"Number@Module_0x16d41315c8795210","mentioned_in":100,"string":null} +{"id":136,"type":"subword","serialized_name":"\u2581Module","mentioned_in":null,"string":null} +{"id":137,"type":"mention","serialized_name":"Module@Module_0x16d41315c9361936","mentioned_in":138,"string":null} +{"id":138,"type":"Module","serialized_name":"Module_0x16d41315c9361936","mentioned_in":null,"string":null} +{"id":139,"type":"ImportFrom","serialized_name":"ImportFrom_0x16d41315c996d4c8","mentioned_in":138,"string":"from Module import Number"} +{"id":140,"type":"mention","serialized_name":"Number@Module_0x16d41315c9361936","mentioned_in":138,"string":null} +{"id":141,"type":"alias","serialized_name":"alias_0x16d41315c9a652fd","mentioned_in":138,"string":null} +{"id":142,"type":"mention","serialized_name":"Number@FunctionDef_0x16d41315c9c41f53","mentioned_in":143,"string":null} +{"id":143,"type":"FunctionDef","serialized_name":"FunctionDef_0x16d41315c9c41f53","mentioned_in":138,"string":"def main():\n a = Number(4)\n b = Number(5)\n print(a+b)"} 
+{"id":144,"type":"Call","serialized_name":"Call_0x16d41315c99e75f7","mentioned_in":143,"string":"Number(4)"} +{"id":145,"type":"Assign","serialized_name":"Assign_0x16d41315c909ace6","mentioned_in":143,"string":"a = Number(4)"} +{"id":146,"type":"subword","serialized_name":"\u2581a","mentioned_in":null,"string":null} +{"id":147,"type":"mention","serialized_name":"a@FunctionDef_0x16d41315c9c41f53","mentioned_in":143,"string":null} +{"id":148,"type":"Call","serialized_name":"Call_0x16d41315c9bee524","mentioned_in":143,"string":"Number(5)"} +{"id":149,"type":"Assign","serialized_name":"Assign_0x16d41315c9c3c775","mentioned_in":143,"string":"b = Number(5)"} +{"id":150,"type":"subword","serialized_name":"\u2581b","mentioned_in":null,"string":null} +{"id":151,"type":"mention","serialized_name":"b@FunctionDef_0x16d41315c9c41f53","mentioned_in":143,"string":null} +{"id":152,"type":"mention","serialized_name":"print@FunctionDef_0x16d41315c9c41f53","mentioned_in":143,"string":null} +{"id":153,"type":"Call","serialized_name":"Call_0x16d41315c9148a75","mentioned_in":143,"string":"print(a+b)"} +{"id":154,"type":"BinOp","serialized_name":"BinOp_0x16d41315c919b5a9","mentioned_in":143,"string":"a+b"} +{"id":155,"type":"mention","serialized_name":"main@Module_0x16d41315c9361936","mentioned_in":138,"string":null} +{"id":156,"type":"Call","serialized_name":"Call_0x16d41315c99405f0","mentioned_in":138,"string":"main()"} diff --git a/examples/small_graph/type_annotations.json b/examples/small_graph/type_annotations.json new file mode 100644 index 00000000..72c533f6 --- /dev/null +++ b/examples/small_graph/type_annotations.json @@ -0,0 +1,16 @@ +{"src":22,"dst":"int"} +{"src":24,"dst":"argument@FunctionDef_0x16d41315b61e0b59"} +{"src":35,"dst":"str"} +{"src":38,"dst":"FunctionDef_0x16d41315b69b5874"} +{"src":49,"dst":"str"} +{"src":38,"dst":"FunctionDef_0x16d41315b6ad90da"} +{"src":53,"dst":"int"} +{"src":24,"dst":"variable1@FunctionDef_0x16d41315b6ad90da"} +{"src":56,"dst":"str"} 
+{"src":38,"dst":"variable2@FunctionDef_0x16d41315b6ad90da"} +{"src":79,"dst":"None"} +{"src":78,"dst":"FunctionDef_0x16d41315b735b6ae"} +{"src":104,"dst":"int"} +{"src":24,"dst":"value@FunctionDef_0x16d41315c8deaa67"} +{"src":128,"dst":"str"} +{"src":38,"dst":"FunctionDef_0x16d41315c81c9bdf"} diff --git a/examples/var_misuse_tiny.yaml b/examples/var_misuse_tiny.yaml new file mode 100644 index 00000000..2e6bb39a --- /dev/null +++ b/examples/var_misuse_tiny.yaml @@ -0,0 +1,55 @@ +DATASET: + custom_reverse: null + data_path: ./new_scaa/with_ast + filter_edges: null + min_count_for_objectives: 5 + no_global_edges: false + random_seed: null + remove_reverse: false + restricted_id_pool: null + self_loops: false + subgraph_id_column: file_id + subgraph_partition: ./new_scaa/with_ast/subgraph_partition.json.bz2 + train_frac: 0.9 + use_edge_types: false + use_node_types: false +MODEL: + activation: tanh + dropout: 0.0 + h_dim: 100 + n_layers: 5 + node_emb_size: 100 + num_bases: 10 + use_att_checkpoint: false + use_gcn_checkpoint: false + use_gru_checkpoint: false + use_self_loop: true +TOKENIZER: + tokenizer_path: sentencepiece_bpe.model +TRAINING: + batch_size: 128 + dilate_scores: 1 + early_stopping: false + early_stopping_tolerance: 20 + elem_emb_size: 100 + embedding_table_size: 200000 + epochs: 100 + external_dataset: null + force_w2v_ns: false + gpu: -1 + learning_rate: 0.001 + measure_scores: true + metric: inner_prod + model_output_dir: ./new_scaa/with_ast + neg_sampling_factor: 3 + nn_index: brute + objectives: subgraph_clf + pretrained: null + pretraining_phase: 0 + restore_state: false + sampling_neighbourhood_size: 10 + save_checkpoints: true + save_each_epoch: false + schedule_layers_every: 10 + use_layer_scheduling: false + use_ns_groups: false diff --git a/figures/graph_examples/100.png b/figures/graph_examples/100.png new file mode 100644 index 00000000..86be27a8 Binary files /dev/null and b/figures/graph_examples/100.png differ diff --git 
a/figures/graph_examples/100_.png b/figures/graph_examples/100_.png new file mode 100644 index 00000000..ecb56188 Binary files /dev/null and b/figures/graph_examples/100_.png differ diff --git a/figures/graph_examples/112.png b/figures/graph_examples/112.png new file mode 100644 index 00000000..6966861e Binary files /dev/null and b/figures/graph_examples/112.png differ diff --git a/figures/graph_examples/118.png b/figures/graph_examples/118.png new file mode 100644 index 00000000..78b6ed01 Binary files /dev/null and b/figures/graph_examples/118.png differ diff --git a/figures/graph_examples/125.png b/figures/graph_examples/125.png new file mode 100644 index 00000000..afd49762 Binary files /dev/null and b/figures/graph_examples/125.png differ diff --git a/figures/graph_examples/139_.png b/figures/graph_examples/139_.png new file mode 100644 index 00000000..6a4ca38f Binary files /dev/null and b/figures/graph_examples/139_.png differ diff --git a/figures/graph_examples/140.png b/figures/graph_examples/140.png new file mode 100644 index 00000000..db2cbc0a Binary files /dev/null and b/figures/graph_examples/140.png differ diff --git a/figures/graph_examples/141.png b/figures/graph_examples/141.png new file mode 100644 index 00000000..4faf97c7 Binary files /dev/null and b/figures/graph_examples/141.png differ diff --git a/figures/graph_examples/144_.png b/figures/graph_examples/144_.png new file mode 100644 index 00000000..18c76ccf Binary files /dev/null and b/figures/graph_examples/144_.png differ diff --git a/figures/graph_examples/149.png b/figures/graph_examples/149.png new file mode 100644 index 00000000..e1e59e34 Binary files /dev/null and b/figures/graph_examples/149.png differ diff --git a/figures/graph_examples/16_.png b/figures/graph_examples/16_.png new file mode 100644 index 00000000..f03a2674 Binary files /dev/null and b/figures/graph_examples/16_.png differ diff --git a/figures/graph_examples/17_.png b/figures/graph_examples/17_.png new file mode 100644 
index 00000000..9398955f Binary files /dev/null and b/figures/graph_examples/17_.png differ diff --git a/figures/graph_examples/35_.png b/figures/graph_examples/35_.png new file mode 100644 index 00000000..c814b7a0 Binary files /dev/null and b/figures/graph_examples/35_.png differ diff --git a/figures/graph_examples/40.png b/figures/graph_examples/40.png new file mode 100644 index 00000000..eeea76d5 Binary files /dev/null and b/figures/graph_examples/40.png differ diff --git a/figures/graph_examples/49_.png b/figures/graph_examples/49_.png new file mode 100644 index 00000000..94cb9370 Binary files /dev/null and b/figures/graph_examples/49_.png differ diff --git a/figures/graph_examples/54.png b/figures/graph_examples/54.png new file mode 100644 index 00000000..887f0f06 Binary files /dev/null and b/figures/graph_examples/54.png differ diff --git a/figures/graph_examples/60.png b/figures/graph_examples/60.png new file mode 100644 index 00000000..9ea01603 Binary files /dev/null and b/figures/graph_examples/60.png differ diff --git a/figures/graph_examples/61.png b/figures/graph_examples/61.png new file mode 100644 index 00000000..b2899e0c Binary files /dev/null and b/figures/graph_examples/61.png differ diff --git a/figures/graph_examples/67_.png b/figures/graph_examples/67_.png new file mode 100644 index 00000000..96b62864 Binary files /dev/null and b/figures/graph_examples/67_.png differ diff --git a/figures/graph_examples/69.png b/figures/graph_examples/69.png new file mode 100644 index 00000000..8919ed13 Binary files /dev/null and b/figures/graph_examples/69.png differ diff --git a/figures/graph_examples/78.png b/figures/graph_examples/78.png new file mode 100644 index 00000000..b0294e70 Binary files /dev/null and b/figures/graph_examples/78.png differ diff --git a/figures/graph_examples/80_.png b/figures/graph_examples/80_.png new file mode 100644 index 00000000..aefe5039 Binary files /dev/null and b/figures/graph_examples/80_.png differ diff --git 
a/figures/graph_examples/no_global_index.png b/figures/graph_examples/no_global_index.png new file mode 100644 index 00000000..983eb59d Binary files /dev/null and b/figures/graph_examples/no_global_index.png differ diff --git a/main.py b/main.py new file mode 100644 index 00000000..3f2f4803 --- /dev/null +++ b/main.py @@ -0,0 +1,93 @@ +from tensorboard import program +from random import random + +from SourceCodeTools.models.training_config import get_config, save_config, load_config +from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq +from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure, SamplingMultitaskTrainer +from SourceCodeTools.models.graph.train.objectives.NodeClassificationObjective import NodeClassifierObjective +from SourceCodeTools.models.graph.train.objectives.SubgraphClassifierObjective import SubgraphAbstractObjective, \ + SubgraphClassifierObjective, SubgraphEmbeddingObjective +from SourceCodeTools.models.graph.train.objectives.SubgraphEmbedderObjective import SubgraphEmbeddingObjective, \ + SubgraphMatchingObjective +from SourceCodeTools.models.graph.train.utils import get_name, get_model_base +from SourceCodeTools.models.graph import RGGAN +from SourceCodeTools.tabular.common import compact_property +from SourceCodeTools.code.data.file_utils import unpersist +from SourceCodeTools.models.graph.train.objectives.SCAAClassifierObjetive import SCAAClassifierObjective + + +import dgl +import torch +import numpy as np +from argparse import Namespace +from torch import nn +from datetime import datetime +from os.path import join +from functools import partial + +tokenizer_path = "sentencepiece_bpe.model" + +data_path = "./examples/one_vs_10/with_ast" +subgraph_partition = join(data_path, "subgraph_partition.json.bz2") +filecontent_path = join(data_path, "common_filecontent.json.bz2") + +config = get_config( + subgraph_id_column="file_id", + data_path=data_path, + 
model_output_dir=data_path, + subgraph_partition=subgraph_partition, + tokenizer_path=tokenizer_path, + objectives="subgraph_clf", + measure_scores=True, + dilate_scores=1, + epochs=100, +) +save_config(config, "var_misuse_tiny.yaml") + +dataset = SourceGraphDataset( + **{**config["DATASET"], **config["TOKENIZER"]} +) + + +def load_labels(): + filecontent = unpersist(filecontent_path) + return filecontent[["id", "user"]].rename({"id": "src", "user": "dst"}, axis=1) + + +load_labels() + + +class Trainer(SamplingMultitaskTrainer): + def create_objectives(self, dataset, tokenizer_path): + self.objectives = nn.ModuleList() + + self.objectives.append( + self._create_subgraph_objective( + objective_name="SCAAMatching", + objective_class=SCAAClassifierObjective, + dataset=dataset, + tokenizer_path=tokenizer_path, + labels_fn=load_labels, + ) + ) + + +# %tensorboard --logdir data + +# the path of your log file. +tracking_address = './examples/one_vs_10/old_pooling' + +tb = program.TensorBoard() +tb.configure(argv=[None, '--logdir', tracking_address]) +url = tb.launch() +print(f"Tensorflow listening on {url}") + +training_procedure( + dataset, + model_name=RGGAN, + model_params=config["MODEL"], + trainer_params=config["TRAINING"], + model_base_path=get_model_base( + config["TRAINING"], get_name(RGGAN, str(datetime.now()))), + trainer=Trainer +) diff --git a/res/python_testdata/example_code/example/ExampleModule.py b/res/python_testdata/example_code/example/ExampleModule.py new file mode 100644 index 00000000..44700482 --- /dev/null +++ b/res/python_testdata/example_code/example/ExampleModule.py @@ -0,0 +1,24 @@ +class ExampleClass: + def __init__(self, argument: int): + """ + Initialize. Инициализация + :param argument: + """ + self.field = argument + + def method1(self) -> str: + """ + Call another method. Вызов другого метода. + :return: + """ + return self.method2() + + def method2(self) -> str: + """ + Simple operations. + Простые операции. 
+ :return: + """ + variable1: int = self.field + variable2: str = str(variable1) + return variable2 diff --git a/res/python_testdata/example_code/example/main.py b/res/python_testdata/example_code/example/main.py new file mode 100644 index 00000000..30caa43d --- /dev/null +++ b/res/python_testdata/example_code/example/main.py @@ -0,0 +1,9 @@ +from ExampleModule import ExampleClass as EC + +instance = EC(5) + +def main() -> None: + print(instance.method1()) + +main() + diff --git a/res/python_testdata/example_code/example2/Module.py b/res/python_testdata/example_code/example2/Module.py new file mode 100644 index 00000000..6c949d86 --- /dev/null +++ b/res/python_testdata/example_code/example2/Module.py @@ -0,0 +1,24 @@ +class Number: + def __init__(self, value: int): + """ + Initialize. Инициализация + :param argument: + """ + self.val = value + + def __add__(self, value): + """ + Add two numbers. + Сложить 2 числа + :param value: + :return: + """ + return Number(self.val + value.val) + + def __repr__(self) -> str: + """ + Return representation + :return: Получить представление + """ + return f"Number({self.val})" + diff --git a/res/python_testdata/example_code/example2/main.py b/res/python_testdata/example_code/example2/main.py new file mode 100644 index 00000000..77b3999d --- /dev/null +++ b/res/python_testdata/example_code/example2/main.py @@ -0,0 +1,9 @@ +from Module import Number + +def main(): + a = Number(4) + b = Number(5) + print(a+b) + +main() + diff --git a/res/python_testdata/hops_example/README.md b/res/python_testdata/hops_example/README.md new file mode 100644 index 00000000..3b9f8938 --- /dev/null +++ b/res/python_testdata/hops_example/README.md @@ -0,0 +1 @@ +Small example suitable for visualizing k-hop graph \ No newline at end of file diff --git a/res/python_testdata/hops_example/hops_example.csv b/res/python_testdata/hops_example/hops_example.csv new file mode 100644 index 00000000..acc66f85 --- /dev/null +++ 
b/res/python_testdata/hops_example/hops_example.csv @@ -0,0 +1,5 @@ +id,filecontent,package +1,"def method2(self) -> str: + variable1: int = self.field + variable2: str = str(variable1) + return variable2",hops \ No newline at end of file diff --git a/res/python_testdata/small_examples/README.md b/res/python_testdata/small_examples/README.md new file mode 100644 index 00000000..49a2c8e6 --- /dev/null +++ b/res/python_testdata/small_examples/README.md @@ -0,0 +1 @@ +Test for creating small visualizations of graph building blocks \ No newline at end of file diff --git a/res/python_testdata/small_examples/small_examples.csv b/res/python_testdata/small_examples/small_examples.csv new file mode 100644 index 00000000..d490381a --- /dev/null +++ b/res/python_testdata/small_examples/small_examples.csv @@ -0,0 +1,80 @@ +id,filecontent,package +1,"with open(a) as b: + do_stuff(b)",small_examples +2,a: int = 5,small_examples +3,"def m(): + pass",small_examples +4,[i for i in list if i != 5],small_examples +5,"for i in list: + k = fn(i) + if k == 4: + fn2(k) + break +else: + fn2(0)",small_examples +6,"if d is True: + a = b +else: + a = c",small_examples +7,a = 5 if b is True else 0,small_examples +8,"try: + a = b +except Exception as e: + a = c +else: + a = d +finally: + print(a)",small_examples +9,"def f(a: int = 5): + return a",small_examples +10,"def __init__(self, argument: int): + """""" + Initialize. Инициализация + :param argument: + """""" + self.field = argument",small_examples +11,"def method2(self) -> str: + """""" + Simple operations. + Простые операции. + :return: + """""" + variable1: int = self.field + variable2: str = str(variable1) + return variable2",small_examples +12,"def main(): + a = Number(4) + b = Number(5) + print(a+b)",small_examples +13,"class ExampleClass: + def __init__(self, argument: int): + """""" + Initialize. Инициализация + :param argument: + """""" + self.field = argument + + def method1(self) -> str: + """""" + Call another method. 
Вызов другого метода. + :return: + """""" + return self.method2() + + def method2(self) -> str: + """""" + Simple operations. + Простые операции. + :return: + """""" + variable1: int = self.field + variable2: str = str(variable1) + return variable2",small_examples +14,"from Module import Number + +def main(): + a = Number(4) + b = Number(5) + print(a+b) + +main()",small_examples \ No newline at end of file diff --git a/res/python_testdata/syntax_test/README.md b/res/python_testdata/syntax_test/README.md new file mode 100644 index 00000000..e9350379 --- /dev/null +++ b/res/python_testdata/syntax_test/README.md @@ -0,0 +1 @@ +Test for ast parser \ No newline at end of file diff --git a/res/python_testdata/syntax_test/syntax_test.py b/res/python_testdata/syntax_test/syntax_test.py new file mode 100644 index 00000000..11b088c2 --- /dev/null +++ b/res/python_testdata/syntax_test/syntax_test.py @@ -0,0 +1,64 @@ +def __init__(self, tqdm_cls, sleep_interval): + Thread.__init__(self) + self.daemon = True # kill thread when main killed (KeyboardInterrupt) + self.was_killed = Event() + self.woken = 0 # last time woken up, to sync with monitor + self.tqdm_cls = tqdm_cls + self.sleep_interval = sleep_interval + if TMonitor._time is not None: + self._time = TMonitor._time + else: + self._time = time + if TMonitor._event is not None: + self._event = TMonitor._event + elif a not in c: + self._event = Event + atexit.register(self.exit) + self.start() + test_call(noop) + test_call2(noop); test_call3(noop) + + a.a.b = 4 + + try: + a = b + except ValueError as e: + b = c + except: + b = a + else: + b = c + finally: + a = c + + for t in tt: + print(tt) + continue + + for t in range(5): + if a in t: + good = True + break + else: + good = False + + + while b: + pass + + c = [1, 2, 3, [1, 2]] + g = tuple(a for a in d if a == b) + g = [a for a in d if a == b] + g = a + b + g += c + a,b = c,d + + if a==0 and b==0: + c = 6 + + if a: + c = 4 + + d = (1, 2) + c = {3, 4, Foo(Bar())} + e = {1:2, 
3:4} diff --git a/res/python_testdata/var_misuse_pico/README.md b/res/python_testdata/var_misuse_pico/README.md new file mode 100644 index 00000000..de39bb7b --- /dev/null +++ b/res/python_testdata/var_misuse_pico/README.md @@ -0,0 +1 @@ +Test ast graph for cubert benchmark \ No newline at end of file diff --git a/res/python_testdata/var_misuse_pico/variable_misuse_test.csv b/res/python_testdata/var_misuse_pico/variable_misuse_test.csv new file mode 100644 index 00000000..da7b86f3 --- /dev/null +++ b/res/python_testdata/var_misuse_pico/variable_misuse_test.csv @@ -0,0 +1,989 @@ +id,fn_path,filecontent,comment,package,label,partition +0,numba/numba/numba/ir.py/Expr.cast,"@classmethod +def cast(cls, value, loc): + '\n A node for implicit casting at the return statement\n ' + op = 'cast' + return cls(op=op, value=value, loc=loc)",original,numba,Correct,train +1,numba/numba/numba/ir.py/Expr.cast,"@classmethod +def cast(cls, value, loc): + '\n A node for implicit casting at the return statement\n ' + op = 'cast' + return cls(op=op, value=value, loc=cls)",VarMisuse@36/62 `loc`->`cls`,numba,Variable misuse,train +2,PyTables/PyTables/tables/utils.py/string_to_classes,"def string_to_classes(s): + if (s == '*'): + c = sorted(six.iterkeys(tracked_classes)) + return c + else: + return s.split()",original,PyTables,Correct,train +3,PyTables/PyTables/tables/utils.py/string_to_classes,"def string_to_classes(s): + if (c == '*'): + c = sorted(six.iterkeys(tracked_classes)) + return c + else: + return s.split()",VarMisuse@12/15 `s`->`c`,PyTables,Variable misuse,train +4,openstack/taskflow/taskflow/jobs/base.py/Job.__init__,"def __init__(self, board, name, uuid=None, details=None, backend=None, book=None, book_data=None): + if uuid: + self._uuid = uuid + else: + self._uuid = uuidutils.generate_uuid() + self._name = name + if (not details): + details = { + + } + self._details = details + self._backend = backend + self._board = board + self._book = book + if (not book_data): + book_data 
= { + + } + self._book_data = book_data",original,openstack,Correct,train +5,openstack/taskflow/taskflow/jobs/base.py/Job.__init__,"def __init__(self, board, name, uuid=None, details=None, backend=None, book=None, book_data=None): + if uuid: + self._uuid = uuid + else: + self._uuid = uuidutils.generate_uuid() + self._name = name + if (not details): + details = { + + } + self._details = details + self._backend = backend + self._board = board + self._book = book + if (not book_data): + book_data = { + + } + name._book_data = book_data",VarMisuse@123/144 `self`->`name`,openstack,Variable misuse,train +6,openstack/neutron/neutron/tests/tools.py/get_random_ip_address,"def get_random_ip_address(version=4): + if (version == 4): + ip_string = ('10.%d.%d.%d' % (random.randint(3, 254), random.randint(3, 254), random.randint(3, 254))) + return netaddr.IPAddress(ip_string) + else: + ip = ipv6_utils.get_ipv6_addr_by_EUI64('2001:db8::/64', get_random_mac()) + return ip",original,openstack,Correct,train +7,openstack/neutron/neutron/tests/tools.py/get_random_ip_address,"def get_random_ip_address(version=4): + if (ip == 4): + ip_string = ('10.%d.%d.%d' % (random.randint(3, 254), random.randint(3, 254), random.randint(3, 254))) + return netaddr.IPAddress(ip_string) + else: + ip = ipv6_utils.get_ipv6_addr_by_EUI64('2001:db8::/64', get_random_mac()) + return ip",VarMisuse@14/18 `version`->`ip`,openstack,Variable misuse,train +8,c00w/bitHopper/bitHopper/Configuration/Miners.py/load_from_db,"def load_from_db(): + '\n Load miners from database\n ' + columns = ['Username TEXT', 'Password TEXT'] + bitHopper.Database.Commands.Create_Table('Miners', columns) + results = bitHopper.Database.execute('SELECT Username, Password FROM Miners') + miners = set() + for (username, password) in results: + miners.add((username, password)) + return miners",original,c00w,Correct,train +9,c00w/bitHopper/bitHopper/Configuration/Miners.py/load_from_db,"def load_from_db(): + '\n Load miners from database\n ' + 
columns = ['Username TEXT', 'Password TEXT'] + bitHopper.Database.Commands.Create_Table('Miners', columns) + results = bitHopper.Database.execute('SELECT Username, Password FROM Miners') + miners = set() + for (username, password) in results: + miners.add((username, results)) + return miners",VarMisuse@67/116 `password`->`results`,c00w,Variable misuse,train +10,sympy/sympy/sympy/core/logic.py/_fuzzy_group,"def _fuzzy_group(args, quick_exit=False): + 'Return True if all args are True, None if there is any None else False\n unless ``quick_exit`` is True (then return None as soon as a second False\n is seen.\n\n ``_fuzzy_group`` is like ``fuzzy_and`` except that it is more\n conservative in returning a False, waiting to make sure that all\n arguments are True or False and returning None if any arguments are\n None. It also has the capability of permiting only a single False and\n returning None if more than one is seen. For example, the presence of a\n single transcendental amongst rationals would indicate that the group is\n no longer rational; but a second transcendental in the group would make the\n determination impossible.\n\n\n Examples\n ========\n\n >>> from sympy.core.logic import _fuzzy_group\n\n By default, multiple Falses mean the group is broken:\n\n >>> _fuzzy_group([False, False, True])\n False\n\n If multiple Falses mean the group status is unknown then set\n `quick_exit` to True so None can be returned when the 2nd False is seen:\n\n >>> _fuzzy_group([False, False, True], quick_exit=True)\n\n But if only a single False is seen then the group is known to\n be broken:\n\n >>> _fuzzy_group([False, True, True], quick_exit=True)\n False\n\n ' + saw_other = False + for a in args: + if (a is True): + continue + if (a is None): + return + if (quick_exit and saw_other): + return + saw_other = True + return (not saw_other)",original,sympy,Correct,train +11,sympy/sympy/sympy/core/logic.py/_fuzzy_group,"def _fuzzy_group(args, quick_exit=False): + 'Return True if 
all args are True, None if there is any None else False\n unless ``quick_exit`` is True (then return None as soon as a second False\n is seen.\n\n ``_fuzzy_group`` is like ``fuzzy_and`` except that it is more\n conservative in returning a False, waiting to make sure that all\n arguments are True or False and returning None if any arguments are\n None. It also has the capability of permiting only a single False and\n returning None if more than one is seen. For example, the presence of a\n single transcendental amongst rationals would indicate that the group is\n no longer rational; but a second transcendental in the group would make the\n determination impossible.\n\n\n Examples\n ========\n\n >>> from sympy.core.logic import _fuzzy_group\n\n By default, multiple Falses mean the group is broken:\n\n >>> _fuzzy_group([False, False, True])\n False\n\n If multiple Falses mean the group status is unknown then set\n `quick_exit` to True so None can be returned when the 2nd False is seen:\n\n >>> _fuzzy_group([False, False, True], quick_exit=True)\n\n But if only a single False is seen then the group is known to\n be broken:\n\n >>> _fuzzy_group([False, True, True], quick_exit=True)\n False\n\n ' + saw_other = False + for a in args: + if (a is True): + continue + if (a is None): + return + if (quick_exit and a): + return + saw_other = True + return (not saw_other)",VarMisuse@55/601 `saw_other`->`a`,sympy,Variable misuse,train +12,scipy/scipy/scipy/integrate/tests/test_quadpack.py/TestNQuad.test_matching_tplquad,"def test_matching_tplquad(self): + + def func3d(x0, x1, x2, c0, c1): + return (((((x0 ** 2) + (c0 * (x1 ** 3))) - (x0 * x1)) + 1) + (c1 * np.sin(x2))) + res = tplquad(func3d, (- 1), 2, (lambda x: (- 2)), (lambda x: 2), (lambda x, y: (- np.pi)), (lambda x, y: np.pi), args=(2, 3)) + res2 = nquad(func3d, [[(- np.pi), np.pi], [(- 2), 2], ((- 1), 2)], args=(2, 3)) + assert_almost_equal(res, res2)",original,scipy,Correct,train 
+13,scipy/scipy/scipy/integrate/tests/test_quadpack.py/TestNQuad.test_matching_tplquad,"def test_matching_tplquad(self): + + def func3d(x0, x1, x2, c0, c1): + return (((((x0 ** 2) + (c0 * (x1 ** 3))) - (x0 * x1)) + 1) + (c1 * np.sin(x2))) + res = tplquad(func3d, (- 1), 2, (lambda x: (- 2)), (lambda x: 2), (lambda x, y: (- np.pi)), (lambda x, y: np.pi), args=(2, 3)) + res2 = nquad(func3d, [[(- np.pi), np.pi], [(- 2), 2], ((- 1), 2)], args=(2, 3)) + assert_almost_equal(res2, res2)",VarMisuse@186/199 `res`->`res2`,scipy,Variable misuse,train +14,airbnb/airflow/airflow/models.py/DAG.owner,"@property +def owner(self): + return ', '.join(list(set([t.owner for t in self.tasks])))",original,airbnb,Correct,train +15,airbnb/airflow/airflow/models.py/DAG.owner,"@property +def owner(self): + return ', '.join(list(set([t.owner for t in t.tasks])))",VarMisuse@29/32 `self`->`t`,airbnb,Variable misuse,train +16,dimagi/commcare-hq/corehq/apps/es/tests/test_aggregations.py/TestAggregations.test_extended_stats_aggregation,"def test_extended_stats_aggregation(self): + json_output = { + 'query': { + 'filtered': { + 'filter': { + 'and': [{ + 'match_all': { + + }, + }], + }, + 'query': { + 'match_all': { + + }, + }, + }, + }, + 'aggs': { + 'name_stats': { + 'extended_stats': { + 'field': 'name', + 'script': 'MY weird script', + }, + }, + }, + 'size': SIZE_LIMIT, + } + query = HQESQuery('cases').aggregation(ExtendedStatsAggregation('name_stats', 'name', script='MY weird script')) + self.checkQuery(query, json_output)",original,dimagi,Correct,train +17,dimagi/commcare-hq/corehq/apps/es/tests/test_aggregations.py/TestAggregations.test_extended_stats_aggregation,"def test_extended_stats_aggregation(self): + json_output = { + 'query': { + 'filtered': { + 'filter': { + 'and': [{ + 'match_all': { + + }, + }], + }, + 'query': { + 'match_all': { + + }, + }, + }, + }, + 'aggs': { + 'name_stats': { + 'extended_stats': { + 'field': 'name', + 'script': 'MY weird script', + }, + }, + }, + 'size': 
SIZE_LIMIT, + } + query = HQESQuery('cases').aggregation(ExtendedStatsAggregation('name_stats', 'name', script='MY weird script')) + query.checkQuery(query, json_output)",VarMisuse@126/199 `self`->`query`,dimagi,Variable misuse,train +18,sympy/sympy/sympy/polys/compatibility.py/IPolys.dmp_irreducible_p,"def dmp_irreducible_p(self, f): + return dmp_irreducible_p(self.to_dense(f), (self.ngens - 1), self.domain)",original,sympy,Correct,train +19,sympy/sympy/sympy/polys/compatibility.py/IPolys.dmp_irreducible_p,"def dmp_irreducible_p(self, f): + return dmp_irreducible_p(f.to_dense(f), (self.ngens - 1), self.domain)",VarMisuse@15/28 `self`->`f`,sympy,Variable misuse,train +20,circus-tent/circus/circus/arbiter.py/Arbiter.reload,"@synchronized('arbiter_reload') +@gen.coroutine +@debuglog +def reload(self, graceful=True, sequential=False): + 'Reloads everything.\n\n Run the :func:`prereload_fn` callable if any, then gracefuly\n reload all watchers.\n ' + if self._stopping: + return + if (self.prereload_fn is not None): + self.prereload_fn(self) + for handler in logger.handlers: + if isinstance(handler, logging.FileHandler): + handler.acquire() + handler.stream.close() + handler.stream = open(handler.baseFilename, handler.mode) + handler.release() + for watcher in self.iter_watchers(): + (yield watcher._reload(graceful=graceful, sequential=sequential)) + tornado_sleep(self.warmup_delay)",original,circus-tent,Correct,train +21,circus-tent/circus/circus/arbiter.py/Arbiter.reload,"@synchronized('arbiter_reload') +@gen.coroutine +@debuglog +def reload(self, graceful=True, sequential=False): + 'Reloads everything.\n\n Run the :func:`prereload_fn` callable if any, then gracefuly\n reload all watchers.\n ' + if self._stopping: + return + if (self.prereload_fn is not None): + self.prereload_fn(self) + for handler in logger.handlers: + if isinstance(handler, logging.FileHandler): + handler.acquire() + handler.stream.close() + handler.stream = open(handler.baseFilename, handler.mode) 
+ handler.release() + for watcher in self.iter_watchers(): + (yield watcher._reload(graceful=self, sequential=sequential)) + tornado_sleep(self.warmup_delay)",VarMisuse@141/217 `graceful`->`self`,circus-tent,Variable misuse,train +22,jnrbsn/daemonocle/tests/test_shutdown.py/test_unresponsive_stop,"def test_unresponsive_stop(pyscript): + script = pyscript(""\n import signal\n import sys\n import time\n from daemonocle import Daemon\n\n def worker():\n def handle_sigterm(*args, **kwargs):\n time.sleep(10)\n\n signal.signal(signal.SIGTERM, handle_sigterm)\n time.sleep(10)\n\n daemon = Daemon(worker=worker, prog='foo', pidfile='foo.pid',\n stop_timeout=1)\n daemon.do_action(sys.argv[1])\n "") + pidfile = os.path.realpath(os.path.join(script.dirname, 'foo.pid')) + script.run('start') + with open(pidfile, 'rb') as f: + pid = int(f.read()) + result = script.run('stop') + assert (result.returncode == 1) + assert (result.stdout == b'Stopping foo ... FAILED\n') + assert (result.stderr == 'ERROR: Timed out while waiting for process (PID {pid}) to terminate\n'.format(pid=pid).encode('utf-8')) + assert psutil.pid_exists(pid) + os.kill(pid, signal.SIGKILL) + try: + proc = psutil.Process(pid) + except psutil.NoSuchProcess: + pass + else: + proc.wait(timeout=1)",original,jnrbsn,Correct,train +23,jnrbsn/daemonocle/tests/test_shutdown.py/test_unresponsive_stop,"def test_unresponsive_stop(pyscript): + script = result(""\n import signal\n import sys\n import time\n from daemonocle import Daemon\n\n def worker():\n def handle_sigterm(*args, **kwargs):\n time.sleep(10)\n\n signal.signal(signal.SIGTERM, handle_sigterm)\n time.sleep(10)\n\n daemon = Daemon(worker=worker, prog='foo', pidfile='foo.pid',\n stop_timeout=1)\n daemon.do_action(sys.argv[1])\n "") + pidfile = os.path.realpath(os.path.join(script.dirname, 'foo.pid')) + script.run('start') + with open(pidfile, 'rb') as f: + pid = int(f.read()) + result = script.run('stop') + assert (result.returncode == 1) + assert (result.stdout 
== b'Stopping foo ... FAILED\n') + assert (result.stderr == 'ERROR: Timed out while waiting for process (PID {pid}) to terminate\n'.format(pid=pid).encode('utf-8')) + assert psutil.pid_exists(pid) + os.kill(pid, signal.SIGKILL) + try: + proc = psutil.Process(pid) + except psutil.NoSuchProcess: + pass + else: + proc.wait(timeout=1)",VarMisuse@12/20 `pyscript`->`result`,jnrbsn,Variable misuse,train +24,openstack/rally/tests/unit/plugins/openstack/context/sahara/test_sahara_output_data_sources.py/SaharaOutputDataSourcesTestCase.setUp,"def setUp(self): + super(SaharaOutputDataSourcesTestCase, self).setUp() + fake_dict = FakeDict() + fake_dict.__dict__['username'] = 'user' + fake_dict.__dict__['password'] = 'passwd' + self.tenants_num = 2 + self.users_per_tenant = 2 + self.users = (self.tenants_num * self.users_per_tenant) + self.task = mock.MagicMock() + self.tenants = { + + } + self.users_key = [] + for i in range(self.tenants_num): + self.tenants[str(i)] = { + 'id': str(i), + 'name': str(i), + 'sahara': { + 'image': '42', + }, + } + for j in range(self.users_per_tenant): + self.users_key.append({ + 'id': ('%s_%s' % (str(i), str(j))), + 'tenant_id': str(i), + 'credential': fake_dict, + }) + self.user_key = [{ + 'id': i, + 'tenant_id': j, + 'credential': 'credential', + } for j in range(self.tenants_num) for i in range(self.users_per_tenant)] + self.context.update({ + 'config': { + 'users': { + 'tenants': self.tenants_num, + 'users_per_tenant': self.users_per_tenant, + }, + 'sahara_output_data_sources': { + 'output_type': 'hdfs', + 'output_url_prefix': 'hdfs://test_host/', + }, + }, + 'admin': { + 'credential': mock.MagicMock(), + }, + 'task': mock.MagicMock(), + 'users': self.users_key, + 'tenants': self.tenants, + })",original,openstack,Correct,train +25,openstack/rally/tests/unit/plugins/openstack/context/sahara/test_sahara_output_data_sources.py/SaharaOutputDataSourcesTestCase.setUp,"def setUp(self): + super(SaharaOutputDataSourcesTestCase, self).setUp() + 
fake_dict = FakeDict() + fake_dict.__dict__['username'] = 'user' + fake_dict.__dict__['password'] = 'passwd' + self.tenants_num = 2 + self.users_per_tenant = 2 + self.users = (self.tenants_num * self.users_per_tenant) + self.task = mock.MagicMock() + self.tenants = { + + } + self.users_key = [] + for i in range(self.tenants_num): + j.tenants[str(i)] = { + 'id': str(i), + 'name': str(i), + 'sahara': { + 'image': '42', + }, + } + for j in range(self.users_per_tenant): + self.users_key.append({ + 'id': ('%s_%s' % (str(i), str(j))), + 'tenant_id': str(i), + 'credential': fake_dict, + }) + self.user_key = [{ + 'id': i, + 'tenant_id': j, + 'credential': 'credential', + } for j in range(self.tenants_num) for i in range(self.users_per_tenant)] + self.context.update({ + 'config': { + 'users': { + 'tenants': self.tenants_num, + 'users_per_tenant': self.users_per_tenant, + }, + 'sahara_output_data_sources': { + 'output_type': 'hdfs', + 'output_url_prefix': 'hdfs://test_host/', + }, + }, + 'admin': { + 'credential': mock.MagicMock(), + }, + 'task': mock.MagicMock(), + 'users': self.users_key, + 'tenants': self.tenants, + })",VarMisuse@109/149 `self`->`j`,openstack,Variable misuse,train +26,edgewall/trac/trac/ticket/default_workflow.py/ConfigurableTicketWorkflow.render_ticket_action_control,"def render_ticket_action_control(self, req, ticket, action): + self.log.debug('render_ticket_action_control: action ""%s""', action) + this_action = self.actions[action] + status = this_action['newstate'] + operations = this_action['operations'] + current_owner = ticket._old.get('owner', ticket['owner']) + author = get_reporter_id(req, 'author') + author_info = partial(Chrome(self.env).authorinfo, req, resource=ticket.resource) + format_author = partial(Chrome(self.env).format_author, req, resource=ticket.resource) + formatted_current_owner = author_info(current_owner) + exists = (ticket._old.get('status', ticket['status']) is not None) + control = [] + hints = [] + if ('reset_workflow' in 
operations): + control.append(_('from invalid state')) + hints.append(_('Current state no longer exists')) + if ('del_owner' in operations): + hints.append(_('The ticket will be disowned')) + if (('set_owner' in operations) or ('may_set_owner' in operations)): + if ('set_owner' in this_action): + owners = self._to_users(this_action['set_owner'], ticket) + elif self.config.getbool('ticket', 'restrict_owner'): + perm = PermissionSystem(self.env) + owners = perm.get_users_with_permission('TICKET_MODIFY') + owners = [user for user in owners if ('TICKET_MODIFY' in PermissionCache(self.env, user, ticket.resource))] + owners = sorted(owners) + else: + owners = None + if ('set_owner' in operations): + default_owner = author + elif ('may_set_owner' in operations): + if (not exists): + default_owner = TicketSystem(self.env).default_owner + else: + default_owner = ticket._old.get('owner', (ticket['owner'] or None)) + if ((owners is not None) and (default_owner not in owners)): + owners.insert(0, default_owner) + else: + raise AssertionError(operations) + id = ('action_%s_reassign_owner' % action) + if (not owners): + owner = req.args.get(id, default_owner) + control.append(tag_('to %(owner)s', owner=tag.input(type='text', id=id, name=id, value=owner))) + if ((not exists) or (current_owner is None)): + hints.append(_('The owner will be the specified user')) + else: + hints.append(tag_('The owner will be changed from %(current_owner)s to the specified user', current_owner=formatted_current_owner)) + elif (len(owners) == 1): + owner = tag.input(type='hidden', id=id, name=id, value=owners[0]) + formatted_new_owner = author_info(owners[0]) + control.append(tag_('to %(owner)s', owner=tag(formatted_new_owner, owner))) + if ((not exists) or (current_owner is None)): + hints.append(tag_('The owner will be %(new_owner)s', new_owner=formatted_new_owner)) + elif (ticket['owner'] != owners[0]): + hints.append(tag_('The owner will be changed from %(current_owner)s to %(new_owner)s', 
current_owner=formatted_current_owner, new_owner=formatted_new_owner)) + else: + selected_owner = req.args.get(id, default_owner) + control.append(tag_('to %(owner)s', owner=tag.select([tag.option(format_author(x), value=(x if (x is not None) else ''), selected=((x == selected_owner) or None)) for x in owners], id=id, name=id))) + if ((not exists) or (current_owner is None)): + hints.append(_('The owner will be the selected user')) + else: + hints.append(tag_('The owner will be changed from %(current_owner)s to the selected user', current_owner=formatted_current_owner)) + elif (('set_owner_to_self' in operations) and (ticket._old.get('owner', ticket['owner']) != author)): + formatted_author = author_info(author) + if ((not exists) or (current_owner is None)): + hints.append(tag_('The owner will be %(new_owner)s', new_owner=formatted_author)) + else: + hints.append(tag_('The owner will be changed from %(current_owner)s to %(new_owner)s', current_owner=formatted_current_owner, new_owner=formatted_author)) + if ('set_resolution' in operations): + if ('set_resolution' in this_action): + resolutions = this_action['set_resolution'] + else: + resolutions = [r.name for r in Resolution.select(self.env)] + if (not resolutions): + raise TracError(_('Your workflow attempts to set a resolution but none is defined (configuration issue, please contact your Trac admin).')) + id = ('action_%s_resolve_resolution' % action) + if (len(resolutions) == 1): + resolution = tag.input(type='hidden', id=id, name=id, value=resolutions[0]) + control.append(tag_('as %(resolution)s', resolution=tag(resolutions[0], resolution))) + hints.append(tag_('The resolution will be set to %(name)s', name=resolutions[0])) + else: + selected_option = req.args.get(id, TicketSystem(self.env).default_resolution) + control.append(tag_('as %(resolution)s', resolution=tag.select([tag.option(x, value=x, selected=((x == selected_option) or None)) for x in resolutions], id=id, name=id))) + hints.append(_('The 
resolution will be set')) + if ('del_resolution' in operations): + hints.append(_('The resolution will be deleted')) + if ('leave_status' in operations): + control.append(tag_('as %(status)s', status=ticket._old.get('status', ticket['status']))) + if (len(operations) == 1): + hints.append((tag_('The owner will remain %(current_owner)s', current_owner=formatted_current_owner) if current_owner else _('The ticket will remain with no owner'))) + elif (ticket['status'] is None): + hints.append(tag_(""The status will be '%(name)s'"", name=status)) + elif (status != '*'): + hints.append(tag_(""Next status will be '%(name)s'"", name=status)) + return (this_action['label'], tag(separated(control, ' ')), tag((separated(hints, '. ', '.') if hints else '')))",original,edgewall,Correct,train +27,edgewall/trac/trac/ticket/default_workflow.py/ConfigurableTicketWorkflow.render_ticket_action_control,"def render_ticket_action_control(self, req, ticket, action): + self.log.debug('render_ticket_action_control: action ""%s""', action) + this_action = self.actions[action] + status = this_action['newstate'] + operations = this_action['operations'] + current_owner = ticket._old.get('owner', ticket['owner']) + author = get_reporter_id(req, 'author') + author_info = partial(Chrome(self.env).authorinfo, req, resource=ticket.resource) + format_author = partial(Chrome(self.env).format_author, req, resource=ticket.resource) + formatted_current_owner = author_info(current_owner) + exists = (ticket._old.get('status', ticket['status']) is not None) + control = [] + hints = [] + if ('reset_workflow' in operations): + control.append(_('from invalid state')) + hints.append(_('Current state no longer exists')) + if ('del_owner' in operations): + hints.append(_('The ticket will be disowned')) + if (('set_owner' in operations) or ('may_set_owner' in operations)): + if ('set_owner' in this_action): + owners = self._to_users(this_action['set_owner'], ticket) + elif self.config.getbool('ticket', 
'restrict_owner'): + perm = PermissionSystem(self.env) + owners = selected_owner.get_users_with_permission('TICKET_MODIFY') + owners = [user for user in owners if ('TICKET_MODIFY' in PermissionCache(self.env, user, ticket.resource))] + owners = sorted(owners) + else: + owners = None + if ('set_owner' in operations): + default_owner = author + elif ('may_set_owner' in operations): + if (not exists): + default_owner = TicketSystem(self.env).default_owner + else: + default_owner = ticket._old.get('owner', (ticket['owner'] or None)) + if ((owners is not None) and (default_owner not in owners)): + owners.insert(0, default_owner) + else: + raise AssertionError(operations) + id = ('action_%s_reassign_owner' % action) + if (not owners): + owner = req.args.get(id, default_owner) + control.append(tag_('to %(owner)s', owner=tag.input(type='text', id=id, name=id, value=owner))) + if ((not exists) or (current_owner is None)): + hints.append(_('The owner will be the specified user')) + else: + hints.append(tag_('The owner will be changed from %(current_owner)s to the specified user', current_owner=formatted_current_owner)) + elif (len(owners) == 1): + owner = tag.input(type='hidden', id=id, name=id, value=owners[0]) + formatted_new_owner = author_info(owners[0]) + control.append(tag_('to %(owner)s', owner=tag(formatted_new_owner, owner))) + if ((not exists) or (current_owner is None)): + hints.append(tag_('The owner will be %(new_owner)s', new_owner=formatted_new_owner)) + elif (ticket['owner'] != owners[0]): + hints.append(tag_('The owner will be changed from %(current_owner)s to %(new_owner)s', current_owner=formatted_current_owner, new_owner=formatted_new_owner)) + else: + selected_owner = req.args.get(id, default_owner) + control.append(tag_('to %(owner)s', owner=tag.select([tag.option(format_author(x), value=(x if (x is not None) else ''), selected=((x == selected_owner) or None)) for x in owners], id=id, name=id))) + if ((not exists) or (current_owner is None)): + 
hints.append(_('The owner will be the selected user')) + else: + hints.append(tag_('The owner will be changed from %(current_owner)s to the selected user', current_owner=formatted_current_owner)) + elif (('set_owner_to_self' in operations) and (ticket._old.get('owner', ticket['owner']) != author)): + formatted_author = author_info(author) + if ((not exists) or (current_owner is None)): + hints.append(tag_('The owner will be %(new_owner)s', new_owner=formatted_author)) + else: + hints.append(tag_('The owner will be changed from %(current_owner)s to %(new_owner)s', current_owner=formatted_current_owner, new_owner=formatted_author)) + if ('set_resolution' in operations): + if ('set_resolution' in this_action): + resolutions = this_action['set_resolution'] + else: + resolutions = [r.name for r in Resolution.select(self.env)] + if (not resolutions): + raise TracError(_('Your workflow attempts to set a resolution but none is defined (configuration issue, please contact your Trac admin).')) + id = ('action_%s_resolve_resolution' % action) + if (len(resolutions) == 1): + resolution = tag.input(type='hidden', id=id, name=id, value=resolutions[0]) + control.append(tag_('as %(resolution)s', resolution=tag(resolutions[0], resolution))) + hints.append(tag_('The resolution will be set to %(name)s', name=resolutions[0])) + else: + selected_option = req.args.get(id, TicketSystem(self.env).default_resolution) + control.append(tag_('as %(resolution)s', resolution=tag.select([tag.option(x, value=x, selected=((x == selected_option) or None)) for x in resolutions], id=id, name=id))) + hints.append(_('The resolution will be set')) + if ('del_resolution' in operations): + hints.append(_('The resolution will be deleted')) + if ('leave_status' in operations): + control.append(tag_('as %(status)s', status=ticket._old.get('status', ticket['status']))) + if (len(operations) == 1): + hints.append((tag_('The owner will remain %(current_owner)s', current_owner=formatted_current_owner) if 
current_owner else _('The ticket will remain with no owner'))) + elif (ticket['status'] is None): + hints.append(tag_(""The status will be '%(name)s'"", name=status)) + elif (status != '*'): + hints.append(tag_(""Next status will be '%(name)s'"", name=status)) + return (this_action['label'], tag(separated(control, ' ')), tag((separated(hints, '. ', '.') if hints else '')))",VarMisuse@273/409 `perm`->`selected_owner`,edgewall,Variable misuse,train +28,sshwsfc/xadmin/xadmin/sites.py/AdminSite.get_urls,"def get_urls(self): + from django.conf.urls import url, include + from xadmin.views.base import BaseAdminView + if settings.DEBUG: + self.check_dependencies() + + def wrap(view, cacheable=False): + + def wrapper(*args, **kwargs): + return self.admin_view(view, cacheable)(*args, **kwargs) + return update_wrapper(wrapper, view) + urlpatterns = [url('^jsi18n/$', wrap(self.i18n_javascript, cacheable=True), name='jsi18n')] + urlpatterns += [url(path, (wrap(self.create_admin_view(clz_or_func)) if ((type(clz_or_func) == type) and issubclass(clz_or_func, BaseAdminView)) else include(clz_or_func(self))), name=name) for (path, clz_or_func, name) in self._registry_views] + for (model, admin_class) in self._registry.iteritems(): + view_urls = [url(path, wrap(self.create_model_admin_view(clz, model, admin_class)), name=(name % (model._meta.app_label, model._meta.model_name))) for (path, clz, name) in self._registry_modelviews] + urlpatterns += [url(('^%s/%s/' % (model._meta.app_label, model._meta.model_name)), include(view_urls))] + return urlpatterns",original,sshwsfc,Correct,train +29,sshwsfc/xadmin/xadmin/sites.py/AdminSite.get_urls,"def get_urls(self): + from django.conf.urls import url, include + from xadmin.views.base import BaseAdminView + if settings.DEBUG: + urlpatterns.check_dependencies() + + def wrap(view, cacheable=False): + + def wrapper(*args, **kwargs): + return self.admin_view(view, cacheable)(*args, **kwargs) + return update_wrapper(wrapper, view) + urlpatterns = 
[url('^jsi18n/$', wrap(self.i18n_javascript, cacheable=True), name='jsi18n')] + urlpatterns += [url(path, (wrap(self.create_admin_view(clz_or_func)) if ((type(clz_or_func) == type) and issubclass(clz_or_func, BaseAdminView)) else include(clz_or_func(self))), name=name) for (path, clz_or_func, name) in self._registry_views] + for (model, admin_class) in self._registry.iteritems(): + view_urls = [url(path, wrap(self.create_model_admin_view(clz, model, admin_class)), name=(name % (model._meta.app_label, model._meta.model_name))) for (path, clz, name) in self._registry_modelviews] + urlpatterns += [url(('^%s/%s/' % (model._meta.app_label, model._meta.model_name)), include(view_urls))] + return urlpatterns",VarMisuse@37/43 `self`->`urlpatterns`,sshwsfc,Variable misuse,train +30,openworm/PyOpenWorm/PyOpenWorm/dataObject.py/DataObject.ObjectProperty,"@classmethod +def ObjectProperty(cls, *args, **kwargs): + ' Create a SimpleProperty that has a complex DataObject as its value\n\n Parameters\n ----------\n linkName : string\n The name of this Property.\n owner : PyOpenWorm.dataObject.DataObject\n The name of this Property.\n value_type : type\n The type of DataObject for values of this property\n ' + return cls._create_property(*args, property_type='ObjectProperty', **kwargs)",original,openworm,Correct,train +31,openworm/PyOpenWorm/PyOpenWorm/dataObject.py/DataObject.ObjectProperty,"@classmethod +def ObjectProperty(cls, *args, **kwargs): + ' Create a SimpleProperty that has a complex DataObject as its value\n\n Parameters\n ----------\n linkName : string\n The name of this Property.\n owner : PyOpenWorm.dataObject.DataObject\n The name of this Property.\n value_type : type\n The type of DataObject for values of this property\n ' + return cls._create_property(*args, property_type='ObjectProperty', **cls)",VarMisuse@34/161 `kwargs`->`cls`,openworm,Variable misuse,train +32,scipy/scipy/scipy/fftpack/tests/test_pseudo_diffs.py/direct_diff,"def direct_diff(x, k=1, period=None): 
+ fx = fft(x) + n = len(fx) + if (period is None): + period = (2 * pi) + w = ((((fftfreq(n) * 2j) * pi) / period) * n) + if (k < 0): + w = (1 / (w ** k)) + w[0] = 0.0 + else: + w = (w ** k) + if (n > 2000): + w[250:(n - 250)] = 0.0 + return ifft((w * fx)).real",original,scipy,Correct,train +33,scipy/scipy/scipy/fftpack/tests/test_pseudo_diffs.py/direct_diff,"def direct_diff(x, k=1, period=None): + fx = fft(x) + n = len(fx) + if (period is None): + period = (2 * pi) + w = ((((fftfreq(n) * 2j) * pi) / period) * n) + if (k < 0): + w = (1 / (w ** k)) + fx[0] = 0.0 + else: + w = (w ** k) + if (n > 2000): + w[250:(n - 250)] = 0.0 + return ifft((w * fx)).real",VarMisuse@94/100 `w`->`fx`,scipy,Variable misuse,train +34,cloudera/hue/desktop/core/ext-py/pyopenssl/OpenSSL/crypto.py/X509StoreContext.verify_certificate,"def verify_certificate(self): + '\n Verify a certificate in a context.\n\n :param store_ctx: The :py:class:`X509StoreContext` to verify.\n :raises: Error\n ' + self._init() + ret = _lib.X509_verify_cert(self._store_ctx) + self._cleanup() + if (ret <= 0): + raise self._exception_from_context()",original,cloudera,Correct,train +35,cloudera/hue/desktop/core/ext-py/pyopenssl/OpenSSL/crypto.py/X509StoreContext.verify_certificate,"def verify_certificate(self): + '\n Verify a certificate in a context.\n\n :param store_ctx: The :py:class:`X509StoreContext` to verify.\n :raises: Error\n ' + ret._init() + ret = _lib.X509_verify_cert(self._store_ctx) + self._cleanup() + if (ret <= 0): + raise self._exception_from_context()",VarMisuse@12/68 `self`->`ret`,cloudera,Variable misuse,train +36,zzzeek/sqlalchemy/test/orm/inheritance/test_polymorphic_rel.py/_PolymorphicTestBase.test_filter_on_subclass_six,"def test_filter_on_subclass_six(self): + sess = create_session() + eq_(sess.query(Boss).filter((Boss.person_id == b1.person_id)).one(), Boss(name='pointy haired boss'))",original,zzzeek,Correct,train 
+37,zzzeek/sqlalchemy/test/orm/inheritance/test_polymorphic_rel.py/_PolymorphicTestBase.test_filter_on_subclass_six,"def test_filter_on_subclass_six(self): + sess = create_session() + eq_(self.query(Boss).filter((Boss.person_id == b1.person_id)).one(), Boss(name='pointy haired boss'))",VarMisuse@18/25 `sess`->`self`,zzzeek,Variable misuse,train +38,anandology/pyjamas/library/gwt/chart/Symbol.py/Symbol.setImageURL,"def setImageURL(self, imageURL): + self.imageURL = imageURL",original,anandology,Correct,train +39,anandology/pyjamas/library/gwt/chart/Symbol.py/Symbol.setImageURL,"def setImageURL(self, imageURL): + imageURL.imageURL = imageURL",VarMisuse@12/16 `self`->`imageURL`,anandology,Variable misuse,train +40,glue-viz/glue/glue/config.py/LinkFunctionRegistry.__call__,"def __call__(self, info='', output_labels=None): + out = (output_labels or []) + + def adder(func): + self.add(self.item(func, info, out)) + return func + return adder",original,glue-viz,Correct,train +41,glue-viz/glue/glue/config.py/LinkFunctionRegistry.__call__,"def __call__(self, info='', output_labels=None): + out = (out or []) + + def adder(func): + self.add(self.item(func, info, out)) + return func + return adder",VarMisuse@21/25 `output_labels`->`out`,glue-viz,Variable misuse,train +42,openstack/networking-cisco/networking_cisco/plugins/cisco/cfg_agent/cfg_agent.py/CiscoCfgAgent.hosting_devices_removed,"def hosting_devices_removed(self, context, payload): + 'Deal with hosting device removed RPC message.' + try: + if payload['hosting_data']: + if payload['hosting_data'].keys(): + self.process_services(removed_devices_info=payload) + except KeyError as e: + LOG.error(_LE('Invalid payload format for received RPC message `hosting_devices_removed`. Error is %(error)s. 
Payload is %(payload)s'), { + 'error': e, + 'payload': payload, + })",original,openstack,Correct,train +43,openstack/networking-cisco/networking_cisco/plugins/cisco/cfg_agent/cfg_agent.py/CiscoCfgAgent.hosting_devices_removed,"def hosting_devices_removed(self, context, payload): + 'Deal with hosting device removed RPC message.' + try: + if payload['hosting_data']: + if payload['hosting_data'].keys(): + self.process_services(removed_devices_info=payload) + except KeyError as e: + LOG.error(_LE('Invalid payload format for received RPC message `hosting_devices_removed`. Error is %(error)s. Payload is %(payload)s'), { + 'error': self, + 'payload': payload, + })",VarMisuse@72/155 `e`->`self`,openstack,Variable misuse,train +44,kuri65536/python-for-android/python-modules/twisted/twisted/web/test/test_web.py/ConditionalTest._modifiedTest,"def _modifiedTest(self, modifiedSince): + '\n Given the value C{modifiedSince} for the I{If-Modified-Since}\n header, verify that a response with a 200 code and the resource as\n the body is returned.\n ' + self.channel.lineReceived(('If-Modified-Since: ' + modifiedSince)) + self.channel.lineReceived('') + result = self.transport.getvalue() + self.failUnlessEqual(httpCode(result), http.OK) + self.failUnlessEqual(httpBody(result), 'correct')",original,kuri65536,Correct,train +45,kuri65536/python-for-android/python-modules/twisted/twisted/web/test/test_web.py/ConditionalTest._modifiedTest,"def _modifiedTest(self, modifiedSince): + '\n Given the value C{modifiedSince} for the I{If-Modified-Since}\n header, verify that a response with a 200 code and the resource as\n the body is returned.\n ' + self.channel.lineReceived(('If-Modified-Since: ' + modifiedSince)) + result.channel.lineReceived('') + result = self.transport.getvalue() + self.failUnlessEqual(httpCode(result), http.OK) + self.failUnlessEqual(httpBody(result), 'correct')",VarMisuse@27/113 `self`->`result`,kuri65536,Variable misuse,train 
+46,openstack/nova/nova/tests/unit/virt/xenapi/test_vm_utils.py/CheckVDISizeTestCase.test_zero_root_gb_disables_check,"def test_zero_root_gb_disables_check(self): + with mock.patch.object(self.instance, 'get_flavor') as get: + self.flavor.root_gb = 0 + get.return_value = self.flavor + vm_utils._check_vdi_size(self.context, self.session, self.instance, self.vdi_uuid)",original,openstack,Correct,train +47,openstack/nova/nova/tests/unit/virt/xenapi/test_vm_utils.py/CheckVDISizeTestCase.test_zero_root_gb_disables_check,"def test_zero_root_gb_disables_check(self): + with mock.patch.object(self.instance, 'get_flavor') as get: + self.flavor.root_gb = 0 + get.return_value = get.flavor + vm_utils._check_vdi_size(self.context, self.session, self.instance, self.vdi_uuid)",VarMisuse@40/55 `self`->`get`,openstack,Variable misuse,train +48,zbyte64/django-dockit/dockit/forms/widgets.py/PrimitiveFormsetListWidget.render,"def render(self, name, value, attrs=None): + if (not isinstance(value, list)): + value = self.decompress(value) + final_attrs = self.build_attrs(attrs) + formset_class = self.get_formset_class() + initial = [{ + 'value': val, + } for val in value] + formset = formset_class(initial=initial, prefix=name) + parts = [('
%s
' % form.as_table()) for form in formset] + parts.append(('
%s
' % (name, formset.empty_form.as_table()))) + output = ('%s %s' % (flatatt(final_attrs), name, formset.management_form, ''.join(parts))) + return mark_safe(output)",original,zbyte64,Correct,train +49,zbyte64/django-dockit/dockit/forms/widgets.py/PrimitiveFormsetListWidget.render,"def render(self, name, value, attrs=None): + if (not isinstance(value, list)): + value = self.decompress(value) + final_attrs = self.build_attrs(attrs) + formset_class = self.get_formset_class() + initial = [{ + 'value': val, + } for val in value] + formset = formset_class(initial=initial, prefix=name) + parts = [('
%s
' % form.as_table()) for form in formset] + parts.append(('
%s
' % (name, formset.empty_form.as_table()))) + output = ('%s %s' % (flatatt(parts), name, formset.management_form, ''.join(parts))) + return mark_safe(output)",VarMisuse@135/241 `final_attrs`->`parts`,zbyte64,Variable misuse,train +50,ecdavis/pants/pants/http/client.py/HTTPResponse.text,"@property +def text(self): + '\n The content of the response, after being decoded into unicode with\n :attr:`encoding`. Be careful when using this with large responses, as\n it will load the entire response into memory. ``None`` if no data has\n been received.\n\n If :attr:`encoding` is ``None``, this will default to ``UTF-8``.\n\n ' + raw = self.content + if (not raw): + return raw + encoding = self.encoding + if (not encoding): + encoding = 'utf-8' + return raw.decode(encoding)",original,ecdavis,Correct,train +51,ecdavis/pants/pants/http/client.py/HTTPResponse.text,"@property +def text(self): + '\n The content of the response, after being decoded into unicode with\n :attr:`encoding`. Be careful when using this with large responses, as\n it will load the entire response into memory. 
``None`` if no data has\n been received.\n\n If :attr:`encoding` is ``None``, this will default to ``UTF-8``.\n\n ' + raw = self.content + if (not raw): + return raw + encoding = self.encoding + if (not encoding): + encoding = 'utf-8' + return raw.decode(raw)",VarMisuse@57/190 `encoding`->`raw`,ecdavis,Variable misuse,train +52,anhstudios/swganh/data/scripts/templates/object/tangible/loot/misc/shared_corsec_id_badge.py/create,"def create(kernel): + result = Tangible() + result.template = 'object/tangible/loot/misc/shared_corsec_id_badge.iff' + result.attribute_template_id = (- 1) + result.stfName('item_n', 'corsec_id_badge') + return result",original,anhstudios,Correct,train +53,anhstudios/swganh/data/scripts/templates/object/tangible/loot/misc/shared_corsec_id_badge.py/create,"def create(kernel): + result = Tangible() + result.template = 'object/tangible/loot/misc/shared_corsec_id_badge.iff' + kernel.attribute_template_id = (- 1) + result.stfName('item_n', 'corsec_id_badge') + return result",VarMisuse@22/45 `result`->`kernel`,anhstudios,Variable misuse,train +54,dbcli/pgcli/tests/test_fuzzy_completion.py/test_matching_should_be_case_insensitive,"def test_matching_should_be_case_insensitive(completer): + ""Fuzzy matching should keep matches even if letter casing doesn't match.\n\n This test checks that variations of the text which have different casing\n are still matched.\n "" + text = 'foo' + collection = ['Foo', 'FOO', 'fOO'] + matches = completer.find_matches(text, collection) + assert (len(matches) == 3)",original,dbcli,Correct,train +55,dbcli/pgcli/tests/test_fuzzy_completion.py/test_matching_should_be_case_insensitive,"def test_matching_should_be_case_insensitive(completer): + ""Fuzzy matching should keep matches even if letter casing doesn't match.\n\n This test checks that variations of the text which have different casing\n are still matched.\n "" + text = 'foo' + collection = ['Foo', 'FOO', 'fOO'] + matches = completer.find_matches(completer, collection) 
+ assert (len(matches) == 3)",VarMisuse@32/116 `text`->`completer`,dbcli,Variable misuse,train +56,statsmodels/statsmodels/statsmodels/sandbox/panel/panel_short.py/ShortPanelGLS.__init__,"def __init__(self, endog, exog, group, sigma_i=None): + self.group = GroupSorted(group) + self.n_groups = self.group.n_groups + nobs_i = (len(endog) / self.n_groups) + if (sigma_i is None): + sigma_i = np.eye(int(nobs_i)) + self.cholsigmainv_i = np.linalg.cholesky(np.linalg.pinv(sigma_i)).T + super(self.__class__, self).__init__(endog, exog, sigma=None)",original,statsmodels,Correct,train +57,statsmodels/statsmodels/statsmodels/sandbox/panel/panel_short.py/ShortPanelGLS.__init__,"def __init__(self, endog, exog, group, sigma_i=None): + self.group = GroupSorted(group) + self.n_groups = self.group.n_groups + nobs_i = (len(endog) / self.n_groups) + if (sigma_i is None): + sigma_i = np.eye(int(nobs_i)) + self.cholsigmainv_i = np.linalg.cholesky(np.linalg.pinv(sigma_i)).T + super(self.__class__, self).__init__(sigma_i, exog, sigma=None)",VarMisuse@107/141 `endog`->`sigma_i`,statsmodels,Variable misuse,train +58,jpscaletti/authcode/tests/test_auth.py/test_hash_password,"def test_hash_password(): + p = 'password' + auth = authcode.Auth(SECRET_KEY, hash='pbkdf2_sha512', rounds=345) + hashed = auth.hash_password(p) + assert hashed.startswith('$pbkdf2-sha512$345$') + assert auth.password_is_valid(p, hashed) + assert (not auth.password_is_valid(p, 'lalala')) + assert (not auth.password_is_valid(p, None)) + assert (not auth.password_is_valid(None, 'lalala')) + assert (not auth.password_is_valid(None, None))",original,jpscaletti,Correct,train +59,jpscaletti/authcode/tests/test_auth.py/test_hash_password,"def test_hash_password(): + p = 'password' + auth = authcode.Auth(SECRET_KEY, hash='pbkdf2_sha512', rounds=345) + hashed = auth.hash_password(p) + assert hashed.startswith('$pbkdf2-sha512$345$') + assert auth.password_is_valid(p, hashed) + assert (not auth.password_is_valid(p, 'lalala')) + 
assert (not auth.password_is_valid(p, None)) + assert (not p.password_is_valid(None, 'lalala')) + assert (not auth.password_is_valid(None, None))",VarMisuse@86/124 `auth`->`p`,jpscaletti,Variable misuse,train +60,berrange/gerrymander/gerrymander/reports.py/ReportOpenReviewStats.older_than,"@staticmethod +def older_than(changes, ages, cutoffdays): + cutoff = (((cutoffdays * 24) * 60) * 60) + older = 0 + for change in changes: + if (ages[change] > cutoff): + older = (older + 1) + return older",original,berrange,Correct,train +61,berrange/gerrymander/gerrymander/reports.py/ReportOpenReviewStats.older_than,"@staticmethod +def older_than(changes, ages, cutoffdays): + cutoff = (((cutoffdays * 24) * 60) * 60) + older = 0 + for change in changes: + if (cutoffdays[change] > cutoff): + older = (older + 1) + return older",VarMisuse@46/54 `ages`->`cutoffdays`,berrange,Variable misuse,train +62,hoytak/treedict/tests/test_setting.py/TestSetting.testFromKeys_03_checking_02,"def testFromKeys_03_checking_02(self): + key_iterable = 'alphabet1' + + def f(): + p = TreeDict.fromkeys(key_iterable) + self.assertRaises(NameError, f)",original,hoytak,Correct,train +63,hoytak/treedict/tests/test_setting.py/TestSetting.testFromKeys_03_checking_02,"def testFromKeys_03_checking_02(self): + key_iterable = 'alphabet1' + + def f(): + p = TreeDict.fromkeys(key_iterable) + key_iterable.assertRaises(NameError, f)",VarMisuse@32/52 `self`->`key_iterable`,hoytak,Variable misuse,train +64,jonaslorander/OctoPiPanel/pygbutton.py/PygButton._propSetVisible,"def _propSetVisible(self, setting): + self._visible = setting",original,jonaslorander,Correct,train +65,jonaslorander/OctoPiPanel/pygbutton.py/PygButton._propSetVisible,"def _propSetVisible(self, setting): + self._visible = self",VarMisuse@16/21 `setting`->`self`,jonaslorander,Variable misuse,train +66,ga4gh/server/ga4gh/backend.py/Backend.runSearchReads,"def runSearchReads(self, request): + '\n Runs the specified SearchReadsRequest.\n ' + return 
self.runSearchRequest(request, protocol.SearchReadsRequest, protocol.SearchReadsResponse, self.readsGenerator)",original,ga4gh,Correct,train +67,ga4gh/server/ga4gh/backend.py/Backend.runSearchReads,"def runSearchReads(self, request): + '\n Runs the specified SearchReadsRequest.\n ' + return self.runSearchRequest(self, protocol.SearchReadsRequest, protocol.SearchReadsResponse, self.readsGenerator)",VarMisuse@19/40 `request`->`self`,ga4gh,Variable misuse,train +68,AppScale/appscale/AppServer/lib/django-0.96/django/core/cache/backends/db.py/CacheClass.delete,"def delete(self, key): + cursor = connection.cursor() + cursor.execute(('DELETE FROM %s WHERE cache_key = %%s' % self._table), [key]) + transaction.commit_unless_managed()",original,AppScale,Correct,train +69,AppScale/appscale/AppServer/lib/django-0.96/django/core/cache/backends/db.py/CacheClass.delete,"def delete(self, key): + cursor = connection.cursor() + cursor.execute(('DELETE FROM %s WHERE cache_key = %%s' % self._table), [self]) + transaction.commit_unless_managed()",VarMisuse@33/53 `key`->`self`,AppScale,Variable misuse,train +70,openstack/taskflow/taskflow/engines/action_engine/executor.py/ParallelProcessTaskExecutor.__init__,"def __init__(self, executor=None, max_workers=None, dispatch_periodicity=None): + super(ParallelProcessTaskExecutor, self).__init__(executor=executor, max_workers=max_workers) + self._manager = _ViewableSyncManager() + self._dispatcher = _Dispatcher(dispatch_periodicity=dispatch_periodicity) + self._worker = None + self._queue = None",original,openstack,Correct,train +71,openstack/taskflow/taskflow/engines/action_engine/executor.py/ParallelProcessTaskExecutor.__init__,"def __init__(self, executor=None, max_workers=None, dispatch_periodicity=None): + super(ParallelProcessTaskExecutor, self).__init__(executor=executor, max_workers=max_workers) + dispatch_periodicity._manager = _ViewableSyncManager() + self._dispatcher = _Dispatcher(dispatch_periodicity=dispatch_periodicity) + 
self._worker = None + self._queue = None",VarMisuse@40/56 `self`->`dispatch_periodicity`,openstack,Variable misuse,train +72,openstack/python-openstacksdk/openstack/telemetry/v2/_proxy.py/Proxy.find_alarm,"def find_alarm(self, name_or_id, ignore_missing=True): + 'Find a single alarm\n\n :param name_or_id: The name or ID of a alarm.\n :param bool ignore_missing: When set to ``False``\n :class:`~openstack.exceptions.ResourceNotFound` will be\n raised when the resource does not exist.\n When set to ``True``, None will be returned when\n attempting to find a nonexistent resource.\n :returns: One :class:`~openstack.telemetry.v2.alarm.Alarm` or None\n ' + return self._find(_alarm.Alarm, name_or_id, ignore_missing=ignore_missing)",original,openstack,Correct,train +73,openstack/python-openstacksdk/openstack/telemetry/v2/_proxy.py/Proxy.find_alarm,"def find_alarm(self, name_or_id, ignore_missing=True): + 'Find a single alarm\n\n :param name_or_id: The name or ID of a alarm.\n :param bool ignore_missing: When set to ``False``\n :class:`~openstack.exceptions.ResourceNotFound` will be\n raised when the resource does not exist.\n When set to ``True``, None will be returned when\n attempting to find a nonexistent resource.\n :returns: One :class:`~openstack.telemetry.v2.alarm.Alarm` or None\n ' + return self._find(_alarm.Alarm, name_or_id, ignore_missing=self)",VarMisuse@31/209 `ignore_missing`->`self`,openstack,Variable misuse,train +74,google/rfmt/formatter/support.py/LayoutElement.NewLineSpace,"@staticmethod +def NewLineSpace(n): + return (lambda console: console.NewLineSpace(n))",original,google,Correct,train +75,google/rfmt/formatter/support.py/LayoutElement.NewLineSpace,"@staticmethod +def NewLineSpace(n): + return (lambda console: console.NewLineSpace(console))",VarMisuse@22/27 `n`->`console`,google,Variable misuse,train +76,tkanerva/openmail/OpenMail/openmail/myhtml.py/Input.__init__,"def __init__(self, *args, **kw): + HtmlElement.__init__(self, *args, **kw) + self.label 
= (self.attributes['label'] if ('label' in self.attributes) else self.attributes['name']) + if ('label' in self.attributes): + del self.attributes['label']",original,tkanerva,Correct,train +77,tkanerva/openmail/OpenMail/openmail/myhtml.py/Input.__init__,"def __init__(self, *args, **kw): + HtmlElement.__init__(self, *args, **kw) + self.label = (self.attributes['label'] if ('label' in self.attributes) else kw.attributes['name']) + if ('label' in self.attributes): + del self.attributes['label']",VarMisuse@49/57 `self`->`kw`,tkanerva,Variable misuse,train +78,zzzeek/sqlalchemy/test/orm/test_eager_relations.py/MixedSelfReferentialEagerTest.setup_mappers,"@classmethod +def setup_mappers(cls): + (b_table, a_table) = (cls.tables.b_table, cls.tables.a_table) + + class A(cls.Comparable): + pass + + class B(cls.Comparable): + pass + mapper(A, a_table) + mapper(B, b_table, properties={ + 'parent_b1': relationship(B, remote_side=[b_table.c.id], primaryjoin=(b_table.c.parent_b1_id == b_table.c.id), order_by=b_table.c.id), + 'parent_z': relationship(A, lazy=True), + 'parent_b2': relationship(B, remote_side=[b_table.c.id], primaryjoin=(b_table.c.parent_b2_id == b_table.c.id), order_by=b_table.c.id), + })",original,zzzeek,Correct,train +79,zzzeek/sqlalchemy/test/orm/test_eager_relations.py/MixedSelfReferentialEagerTest.setup_mappers,"@classmethod +def setup_mappers(cls): + (b_table, a_table) = (cls.tables.b_table, cls.tables.a_table) + + class A(cls.Comparable): + pass + + class B(cls.Comparable): + pass + mapper(A, a_table) + mapper(B, b_table, properties={ + 'parent_b1': relationship(B, remote_side=[b_table.c.id], primaryjoin=(b_table.c.parent_b1_id == b_table.c.id), order_by=cls.c.id), + 'parent_z': relationship(A, lazy=True), + 'parent_b2': relationship(B, remote_side=[b_table.c.id], primaryjoin=(b_table.c.parent_b2_id == b_table.c.id), order_by=b_table.c.id), + })",VarMisuse@112/148 `b_table`->`cls`,zzzeek,Variable misuse,train 
+80,Yelp/fullerite/src/diamond/collectors/icinga_stats/icinga_stats.py/IcingaStatsCollector._trans_svc_state,"def _trans_svc_state(self, state): + ' Translate/validate Service state ' + if (state == 0): + return 'ok' + elif (state == 1): + return 'warning' + elif (state == 2): + return 'critical' + else: + return 'unknown'",original,Yelp,Correct,train +81,Yelp/fullerite/src/diamond/collectors/icinga_stats/icinga_stats.py/IcingaStatsCollector._trans_svc_state,"def _trans_svc_state(self, state): + ' Translate/validate Service state ' + if (state == 0): + return 'ok' + elif (self == 1): + return 'warning' + elif (state == 2): + return 'critical' + else: + return 'unknown'",VarMisuse@29/46 `state`->`self`,Yelp,Variable misuse,train +82,circus-tent/circus/circus/tests/test_watcher.py/TestWatcherInitialization.test_copy_env,"@tornado.testing.gen_test +def test_copy_env(self): + old_environ = os.environ + try: + os.environ = { + 'COCONUTS': 'MIGRATE', + } + watcher = Watcher('foo', 'foobar', copy_env=True) + self.assertEqual(watcher.env, os.environ) + watcher = Watcher('foo', 'foobar', copy_env=True, env={ + 'AWESOMENESS': 'YES', + }) + self.assertEqual(watcher.env, { + 'COCONUTS': 'MIGRATE', + 'AWESOMENESS': 'YES', + }) + finally: + os.environ = old_environ",original,circus-tent,Correct,train +83,circus-tent/circus/circus/tests/test_watcher.py/TestWatcherInitialization.test_copy_env,"@tornado.testing.gen_test +def test_copy_env(self): + old_environ = os.environ + try: + os.environ = { + 'COCONUTS': 'MIGRATE', + } + watcher = Watcher('foo', 'foobar', copy_env=True) + old_environ.assertEqual(watcher.env, os.environ) + watcher = Watcher('foo', 'foobar', copy_env=True, env={ + 'AWESOMENESS': 'YES', + }) + self.assertEqual(watcher.env, { + 'COCONUTS': 'MIGRATE', + 'AWESOMENESS': 'YES', + }) + finally: + os.environ = old_environ",VarMisuse@53/74 `self`->`old_environ`,circus-tent,Variable misuse,train \ No newline at end of file diff --git 
a/scripts/data_extraction/process_sourcetrail.sh b/scripts/data_extraction/process_sourcetrail.sh deleted file mode 100644 index 52ff257c..00000000 --- a/scripts/data_extraction/process_sourcetrail.sh +++ /dev/null @@ -1,69 +0,0 @@ -conda activate SourceCodeTools - -BPE_PATH="$2" -ENVS_DIR=$(realpath "$1") -RUN_DIR=$(realpath "$(dirname "$0")") -SQL_Q=$(realpath "$RUN_DIR/extract.sql") - -#echo $RUN_DIR -#echo $SQL_Q -#echo $ENVS_DIR - -#get directories -#https://stackoverflow.com/questions/2107945/how-to-loop-over-directories-in-linux - -for dir in "$ENVS_DIR"/*; do - if [ -d "$dir" ]; then - package_name="$(basename "$dir")" - - echo "Found package $package_name" - - if [ -f "$dir/source_graph_bodies.csv" ]; then - rm "$dir/source_graph_bodies.csv" - fi - if [ -f "$dir/nodes_with_ast.csv" ]; then - rm "$dir/nodes_with_ast.csv" - fi - if [ -f "$dir/edges_with_ast.csv" ]; then - rm "$dir/edges_with_ast.csv" - fi - if [ -f "$dir/call_seq.csv" ]; then - rm "$dir/call_seq.csv" - fi - if [ -f "$dir/source_graph_function_variable_pairs.csv" ]; then - rm "$dir/source_graph_function_variable_pairs.csv" - fi - - - if [ -f "$dir/$package_name.srctrldb" ]; then - cd "$dir" - sqlite3 "$dir/$package_name.srctrldb" < "$SQL_Q" - cd "$RUN_DIR" - sourcetrail_verify_files.py "$dir" - sourcetrail_node_name_merge.py "$dir/nodes.csv" - sourcetrail_decode_edge_types.py "$dir/edges.csv" - sourcetrail_filter_ambiguous_edges.py $dir - sourcetrail_parse_bodies.py "$dir" - sourcetrail_call_seq_extractor.py "$dir" - - - sourcetrail_add_reverse_edges.py "$dir/edges.bz2" - if [ -n "$BPE_PATH" ]; then - BPE_PATH=$(realpath "$BPE_PATH") - sourcetrail_ast_edges.py "$dir" -bpe $BPE_PATH --create_subword_instances - else - sourcetrail_ast_edges.py "$dir" - fi - sourcetrail_extract_variable_names.py python "$dir" - - if [ -f "$dir"/edges_with_ast_temp.csv ]; then - rm "$dir"/edges_with_ast_temp.csv - fi - - else - echo "Package not indexed" - fi - fi -done - -conda deactivate \ No newline at end of 
file diff --git a/scripts/datasets/cubert_benchmarks/prepare.sh b/scripts/datasets/cubert_benchmarks/prepare.sh new file mode 100644 index 00000000..ed54d149 --- /dev/null +++ b/scripts/datasets/cubert_benchmarks/prepare.sh @@ -0,0 +1,12 @@ +CUBERT_DATA_PATH=$(realpath "$1") +BPE_TOKENIZER=$(realpath "$2") +OUTPUT_PATH=$(realpath "$3") +RUN_DIR=$(realpath "$(dirname "$0")") + +KEEP_FRAC=0.10 + +python "$RUN_DIR/../../../SourceCodeTools/code/data/cubert_python_benchmarks/prepare_for_ast_parser.py" $CUBERT_DATA_PATH +python "$RUN_DIR/../../../SourceCodeTools/code/data/cubert_python_benchmarks/convert_for_ast_graph_builder.py" $CUBERT_DATA_PATH/variable_misuse.json $CUBERT_DATA_PATH/variable_misuse.csv +python "$RUN_DIR/../../../SourceCodeTools/code/data/ast_graph/build_ast_graph.py" --bpe_tokenizer $BPE_TOKENIZER --track_offsets $CUBERT_DATA_PATH/variable_misuse.csv $OUTPUT_PATH --chunksize 100000 --keep_frac $KEEP_FRAC --seed 42 --do_extraction +python "$RUN_DIR/../../../SourceCodeTools/code/data/cubert_python_benchmarks/variable_misuse_node_level_labels.py" $OUTPUT_PATH/with_ast +python "$RUN_DIR/../../../SourceCodeTools/code/data/cubert_python_benchmarks/extract_partition.py" $OUTPUT_PATH/with_ast/common_filecontent.json diff --git a/scripts/data_aggregation/component_analysis.sh b/scripts/datasets/source_graph/data_aggregation/component_analysis.sh similarity index 100% rename from scripts/data_aggregation/component_analysis.sh rename to scripts/datasets/source_graph/data_aggregation/component_analysis.sh diff --git a/scripts/data_aggregation/extra.sh b/scripts/datasets/source_graph/data_aggregation/extra.sh similarity index 100% rename from scripts/data_aggregation/extra.sh rename to scripts/datasets/source_graph/data_aggregation/extra.sh diff --git a/scripts/data_aggregation/join_type_names.py b/scripts/datasets/source_graph/data_aggregation/join_type_names.py similarity index 100% rename from scripts/data_aggregation/join_type_names.py rename to 
scripts/datasets/source_graph/data_aggregation/join_type_names.py diff --git a/scripts/data_aggregation/merge_graphs.sh b/scripts/datasets/source_graph/data_aggregation/merge_graphs.sh similarity index 100% rename from scripts/data_aggregation/merge_graphs.sh rename to scripts/datasets/source_graph/data_aggregation/merge_graphs.sh diff --git a/scripts/data_aggregation/merge_graphs_only_annotations.sh b/scripts/datasets/source_graph/data_aggregation/merge_graphs_only_annotations.sh similarity index 100% rename from scripts/data_aggregation/merge_graphs_only_annotations.sh rename to scripts/datasets/source_graph/data_aggregation/merge_graphs_only_annotations.sh diff --git a/scripts/data_aggregation/remove_annotation_edges.sh b/scripts/datasets/source_graph/data_aggregation/remove_annotation_edges.sh similarity index 100% rename from scripts/data_aggregation/remove_annotation_edges.sh rename to scripts/datasets/source_graph/data_aggregation/remove_annotation_edges.sh diff --git a/scripts/data_annotation/merge_annotation_texts.sh b/scripts/datasets/source_graph/data_annotation/merge_annotation_texts.sh similarity index 100% rename from scripts/data_annotation/merge_annotation_texts.sh rename to scripts/datasets/source_graph/data_annotation/merge_annotation_texts.sh diff --git a/scripts/data_annotation/run.sh b/scripts/datasets/source_graph/data_annotation/run.sh similarity index 100% rename from scripts/data_annotation/run.sh rename to scripts/datasets/source_graph/data_annotation/run.sh diff --git a/scripts/data_collection/generate_random_walks.jl b/scripts/datasets/source_graph/data_collection/generate_random_walks.jl similarity index 100% rename from scripts/data_collection/generate_random_walks.jl rename to scripts/datasets/source_graph/data_collection/generate_random_walks.jl diff --git a/scripts/data_collection/python/check_pip_exist.sh b/scripts/datasets/source_graph/data_collection/python/000_check_pip_exist.sh similarity index 100% rename from 
scripts/data_collection/python/check_pip_exist.sh rename to scripts/datasets/source_graph/data_collection/python/000_check_pip_exist.sh diff --git a/scripts/data_collection/python/install_envs.sh b/scripts/datasets/source_graph/data_collection/python/001_install_envs.sh similarity index 100% rename from scripts/data_collection/python/install_envs.sh rename to scripts/datasets/source_graph/data_collection/python/001_install_envs.sh diff --git a/scripts/datasets/source_graph/data_collection/python/002_process_folders.sh b/scripts/datasets/source_graph/data_collection/python/002_process_folders.sh new file mode 100644 index 00000000..09f42a41 --- /dev/null +++ b/scripts/datasets/source_graph/data_collection/python/002_process_folders.sh @@ -0,0 +1,59 @@ +# this script processes arbitrary code with sourcetrails, no dependencies are pulled + +conda activate SourceCodeTools + +create_sourcetrail_project_if_not_exist () { + if [ ! -f $1 ]; then + echo "Creating Sourcetrail project for $repo" + echo " + + + + Python Source Group + . + + .py + + + . + + enabled + Python Source Group + + + 8 +" > $1 + fi +} + + +run_indexer () { + repo=$1 + if [ ! -f "$repo/sourcetrail.log" ]; then + run_indexing=true + else + find_edges=$(cat "$repo/sourcetrail.log" | grep " Edges") + if [ -z "$find_edges" ]; then + echo "Indexing was interrupted, recovering..." 
+ run_indexing=true + else + run_indexing=false + fi + fi + + if $run_indexing; then + echo "Begin indexing" + Sourcetrail.sh index -i $repo/$repo.srctrlprj >> $repo/sourcetrail.log + else + echo "Already indexed" + fi +} + + +while read repo +do + create_sourcetrail_project_if_not_exist "$repo/$repo.srctrlprj" + run_indexer "$repo" +done < "${1:-/dev/stdin}" + +conda deactivate diff --git a/scripts/data_collection/python/repos_in_pip.txt b/scripts/datasets/source_graph/data_collection/python/csn_repos_in_pip.txt similarity index 100% rename from scripts/data_collection/python/repos_in_pip.txt rename to scripts/datasets/source_graph/data_collection/python/csn_repos_in_pip.txt diff --git a/scripts/data_collection/python/deprecated/commands.txt b/scripts/datasets/source_graph/data_collection/python/deprecated/commands.txt similarity index 100% rename from scripts/data_collection/python/deprecated/commands.txt rename to scripts/datasets/source_graph/data_collection/python/deprecated/commands.txt diff --git a/scripts/data_collection/python/packages.txt b/scripts/datasets/source_graph/data_collection/python/deprecated/packages.txt similarity index 100% rename from scripts/data_collection/python/packages.txt rename to scripts/datasets/source_graph/data_collection/python/deprecated/packages.txt diff --git a/scripts/datasets/source_graph/data_collection/python/scaa/process_authors.sh b/scripts/datasets/source_graph/data_collection/python/scaa/process_authors.sh new file mode 100644 index 00000000..b7180997 --- /dev/null +++ b/scripts/datasets/source_graph/data_collection/python/scaa/process_authors.sh @@ -0,0 +1,51 @@ +# this script processes arbitrary code with sourcetrails, no dependencies are pulled + +conda activate python37 + +while read repo +do + + if [ ! -f "$repo/$repo.srctrlprj" ]; then + echo "Creating Sourcetrail project for $repo" + echo " + + + + Python Source Group + . + + .py + + + . 
+ + enabled + Python Source Group + + + 8 +" > $repo/$repo.srctrlprj + fi + + if [ ! -f "$repo/sourcetrail.log" ]; then + run_indexing=true + else + find_edges=$(cat "$repo/sourcetrail.log" | grep " Edges") + if [ -z "$find_edges" ]; then + echo "Indexing was interrupted, recovering..." + run_indexing=true + else + run_indexing=false + fi + fi + + if $run_indexing; then + echo "Begin indexing" + Sourcetrail.sh index -i $repo/$repo.srctrlprj >> $repo/sourcetrail.log + else + echo "Already indexed" + fi + +done < "${1:-/dev/stdin}" + +conda deactivate diff --git a/scripts/data_collection/python/test_envs.txt b/scripts/datasets/source_graph/data_collection/python/test_envs.txt similarity index 100% rename from scripts/data_collection/python/test_envs.txt rename to scripts/datasets/source_graph/data_collection/python/test_envs.txt diff --git a/scripts/datasets/source_graph/data_collection/python/versioned/install_versioned_envs.sh b/scripts/datasets/source_graph/data_collection/python/versioned/install_versioned_envs.sh new file mode 100644 index 00000000..a337f2fe --- /dev/null +++ b/scripts/datasets/source_graph/data_collection/python/versioned/install_versioned_envs.sh @@ -0,0 +1,120 @@ +conda activate python37 + +while read repo_link +do +# echo $repo/bin/python + # Create environment if does not exist + repo=$(echo $repo_link | awk -F"/" '{print $5}') + blob=$(echo $repo_link | awk -F"/" '{print $7}') + if [ ! -f "$repo/bin/python" ] + then + echo "Creating environment for $repo" + if [ -z "$(python -m venv $repo)" ] + then + echo "Created environment for $repo" + cr_env=true + else + cr_env=false + fi + else + echo "Environment for $repo exists" + cr_env=true + fi + + # Activate environment if created + if $cr_env; then + source $repo/bin/activate && act_env=true + echo "Activated $repo" + fi + + # Process + if $act_env; then + if [ ! 
-f "$repo/$repo.zip" ] + then + git checkout -b new_branch hash + wget "$repo_link" -O "$repo/$repo.zip" + unzip -d $repo "$repo/$repo.zip" + # mv $repo/$repo-$blob/* "$repo/" + fi + repo_path="$repo/$repo-$blob/" + + # install packages + if [ ! -f "$repo/packages.txt" ]; then + echo "Installing packages for $repo" + + pip install $repo_path > $repo/piplog.log 2>$repo/piperr.log +# pip install $repo > $repo/piplog.log + pip freeze > $repo/packages.txt + if [ -z "$(python -m venv $repo)" ] + then + rm $repo/packages.txt + fi + +# else +# cat $repo/packages.txt | xargs pip install > $repo/piplogsecondary.log + fi + + if [ ! -f "$repo/$repo.srctrlprj" ]; then + echo "Creating Sourcetrail project for $repo" + repo_lower=$(echo "$repo" | awk '{print tolower($0)}') + repo_lower_under="${repo_lower//[-]/_}" + if [ ! -d "$repo_lower_under" ] + then + repo_lower_under="${repo_lower//[-]/}" + if [ ! -d "$repo_lower_under" ] + then + touch "$repo/error-resolving-package-name" + fi + fi + echo " + + + + Python Source Group + . + + .py + + + lib/python3.7/site-packages/$repo_lower_under + + enabled + Python Source Group + + + 8 +" > $repo/$repo.srctrlprj + fi + + if [ ! -f "$repo/sourcetrail.log" ]; then + run_indexing=true + else + find_edges=$(cat "$repo/sourcetrail.log" | grep " Edges") + if [ -z "$find_edges" ]; then + echo "Indexing was interrupted, recovering..." 
+ run_indexing=true + else + run_indexing=false + fi + fi + + if $run_indexing; then + echo "Begin indexing" + # try to install packages (maybe environment is empty) + cat $repo/packages.txt | xargs pip install > $repo/piplogsecondary.log + Sourcetrail.sh index -i $repo/$repo.srctrlprj >> $repo/sourcetrail.log + cat $repo/packages.txt | xargs pip uninstall -y > $repo/pipuninstlog.log + else + echo "Already indexed" + fi + fi + + # Deactivate environment if activated + if $act_env; then + deactivate + echo "Deactivated $repo" + fi + +done < "${1:-/dev/stdin}" + +conda deactivate diff --git a/scripts/datasets/source_graph/data_collection/python/versioned/versioned-zips-test.txt b/scripts/datasets/source_graph/data_collection/python/versioned/versioned-zips-test.txt new file mode 100644 index 00000000..defc0814 --- /dev/null +++ b/scripts/datasets/source_graph/data_collection/python/versioned/versioned-zips-test.txt @@ -0,0 +1,10 @@ +https://codeload.github.com/0101/pipetools/zip/42f71af0ecaeacee0f3d64c8706ddb1caacf8bc1 +https://codeload.github.com/02strich/django-auth-kerberos/zip/416c2e2fafe6f0dbddf79bef6f6dbb90b04ceedc +https://codeload.github.com/05bit/peewee-async/zip/d15f4629da1d9975da4ec37306188e68d288c862 +https://codeload.github.com/0compute/xtraceback/zip/5f4ae11cf21e6eea830d79aed66d3cd91bd013cd +https://codeload.github.com/0k/kids.cache/zip/668f3b966877c4a0855d60e05cc3706cf37e4570 +https://codeload.github.com/0k/kids.cmd/zip/bbe958556bc72e6579d4007a28064e2f62109bcf +https://codeload.github.com/0k/shyaml/zip/8dcf9bb39f995564271600a4dae1fc114077c2e0 +https://codeload.github.com/101Loop/drf-addons/zip/62392c72e8bce237f4140a2b7171e89984cb15c5 +https://codeload.github.com/10gen/mongo-orchestration/zip/81fd2224205922ea2178b08190b53a33aec47261 +https://codeload.github.com/1and1/infrascope/zip/d7e291917e618a0a3cd6d5bfc20c6b5defd7550c diff --git a/scripts/data_collection/requirements.txt b/scripts/datasets/source_graph/data_collection/requirements.txt similarity 
index 100% rename from scripts/data_collection/requirements.txt rename to scripts/datasets/source_graph/data_collection/requirements.txt diff --git a/scripts/data_extraction/extract.sql b/scripts/datasets/source_graph/data_extraction/extract.sql similarity index 100% rename from scripts/data_extraction/extract.sql rename to scripts/datasets/source_graph/data_extraction/extract.sql diff --git a/scripts/data_extraction/find_failed.sh b/scripts/datasets/source_graph/data_extraction/find_failed.sh similarity index 100% rename from scripts/data_extraction/find_failed.sh rename to scripts/datasets/source_graph/data_extraction/find_failed.sh diff --git a/scripts/datasets/source_graph/data_extraction/process_sourcetrail.sh b/scripts/datasets/source_graph/data_extraction/process_sourcetrail.sh new file mode 100644 index 00000000..4dfbc28e --- /dev/null +++ b/scripts/datasets/source_graph/data_extraction/process_sourcetrail.sh @@ -0,0 +1,69 @@ +conda activate SourceCodeTools + +BPE_PATH="$2" +ENVS_DIR=$(realpath "$1") +RUN_DIR=$(realpath "$(dirname "$0")") +SQL_Q=$(realpath "$RUN_DIR/extract.sql") + +#echo $RUN_DIR +#echo $SQL_Q +#echo $ENVS_DIR + +#get directories +#https://stackoverflow.com/questions/2107945/how-to-loop-over-directories-in-linux + +for dir in "$ENVS_DIR"/*; do + if [ -d "$dir" ]; then + package_name="$(basename "$dir")" + + echo "Found package $package_name" + +# if [ -f "$dir/source_graph_bodies.csv" ]; then +# rm "$dir/source_graph_bodies.csv" +# fi +# if [ -f "$dir/nodes_with_ast.csv" ]; then +# rm "$dir/nodes_with_ast.csv" +# fi +# if [ -f "$dir/edges_with_ast.csv" ]; then +# rm "$dir/edges_with_ast.csv" +# fi +# if [ -f "$dir/call_seq.csv" ]; then +# rm "$dir/call_seq.csv" +# fi +# if [ -f "$dir/source_graph_function_variable_pairs.csv" ]; then +# rm "$dir/source_graph_function_variable_pairs.csv" +# fi + + + if [ -f "$dir/$package_name.srctrldb" ]; then + cd "$dir" + sqlite3 "$dir/$package_name.srctrldb" < "$SQL_Q" + cd "$RUN_DIR" + 
sourcetrail_verify_files.py "$dir" +# sourcetrail_node_name_merge.py "$dir/nodes.csv" +# sourcetrail_decode_edge_types.py "$dir/edges.csv" +# sourcetrail_filter_ambiguous_edges.py $dir +# sourcetrail_parse_bodies.py "$dir" +# sourcetrail_call_seq_extractor.py "$dir" + + +# sourcetrail_add_reverse_edges.py "$dir/edges.bz2" +# if [ -n "$BPE_PATH" ]; then +# BPE_PATH=$(realpath "$BPE_PATH") +# sourcetrail_ast_edges.py "$dir" -bpe $BPE_PATH --create_subword_instances +# else +# sourcetrail_ast_edges.py "$dir" +# fi +# sourcetrail_extract_variable_names.py python "$dir" + +# if [ -f "$dir"/edges_with_ast_temp.csv ]; then +# rm "$dir"/edges_with_ast_temp.csv +# fi + + else + echo "Package not indexed" + fi + fi +done + +conda deactivate \ No newline at end of file diff --git a/scripts/datasets/source_graph/data_extraction/process_sourcetrail_v2.sh b/scripts/datasets/source_graph/data_extraction/process_sourcetrail_v2.sh new file mode 100644 index 00000000..a9679b14 --- /dev/null +++ b/scripts/datasets/source_graph/data_extraction/process_sourcetrail_v2.sh @@ -0,0 +1,24 @@ +conda activate SourceCodeTools + +ENVS_DIR=$(realpath "$1") +RUN_DIR=$(realpath "$(dirname "$0")") +SQL_Q=$(realpath "$RUN_DIR/extract.sql") + +for dir in "$ENVS_DIR"/*; do + if [ -d "$dir" ]; then + package_name="$(basename "$dir")" + + echo "Found package $package_name" + + if [ -f "$dir/$package_name.srctrldb" ]; then + cd "$dir" + sqlite3 "$dir/$package_name.srctrldb" < "$SQL_Q" + cd "$RUN_DIR" + sourcetrail_verify_files.py "$dir" + else + echo "Package not indexed" + fi + fi +done + +conda deactivate \ No newline at end of file diff --git a/scripts/training/dglke_log_parser.py b/scripts/training/dglke_log_parser.py new file mode 100644 index 00000000..68455e0e --- /dev/null +++ b/scripts/training/dglke_log_parser.py @@ -0,0 +1,79 @@ +import matplotlib.pyplot as plt +import numpy as np + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("logfile") + + args = 
parser.parse_args() + + with open(args.logfile, "r") as logfile: + steps = [[]] + buffer_pos = [[]] + buffer_neg = [[]] + model_names = [] + scores = [{}] + for line in logfile: + if "pos_loss" in line: + buffer_pos[-1].append(float(line.split("pos_loss: ")[-1])) + steps[-1].append(float(line.split("(")[-1].split("/")[0])) + elif "neg_loss" in line: + buffer_neg[-1].append(float(line.split("neg_loss: ")[-1])) + elif "Save model" in line: + model_names.append(line.split("/")[-2]) + scores[-1]["Model"] = model_names[-1] + elif "Test average MRR" in line: + scores[-1]["MRR"] = line.split(":")[-1].strip() + elif "Test average MR" in line: + scores[-1]["MR"] = line.split(":")[-1].strip() + elif "Test average HITS@1:" in line: + scores[-1]["HITS@1"] = line.split(":")[-1].strip() + elif "Test average HITS@3:" in line: + scores[-1]["HITS@3"] = line.split(":")[-1].strip() + elif "Test average HITS@10:" in line: + scores[-1]["HITS@10"] = line.split(":")[-1].strip() + buffer_pos.append([]) + buffer_neg.append([]) + steps.append([]) + scores.append({}) + + for s, v in zip(steps, buffer_pos): + plt.plot(s, v) + plt.xlabel("Step") + plt.ylabel("Loss") + plt.gca().set_yscale("log") + plt.legend(model_names) + plt.grid() + plt.savefig("positive_loss.svg") + plt.close() + + for s, v in zip(steps, buffer_neg): + plt.plot(s, v) + plt.xlabel("Step") + plt.ylabel("Loss") + plt.gca().set_yscale("log") + plt.grid() + plt.legend(model_names) + plt.savefig("negative_loss.svg") + plt.close() + + for s, v, n in zip(steps, buffer_pos, buffer_neg): + plt.plot(s, np.array(v) + np.array(n)) + plt.xlabel("Step") + plt.ylabel("Loss") + plt.gca().set_yscale("log") + plt.grid() + plt.legend(model_names) + plt.savefig("overall_loss.svg") + plt.close() + + import pandas as pd + s = pd.DataFrame.from_records(scores) + print(s.to_string()) + + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/training/evaluate.py b/scripts/training/evaluate.py index 
69e23432..a5425523 100644 --- a/scripts/training/evaluate.py +++ b/scripts/training/evaluate.py @@ -1,20 +1,44 @@ import json import logging import os +import shutil from datetime import datetime from os import mkdir -from os.path import isdir, join +from os.path import isdir -from SourceCodeTools.code.data.sourcetrail.Dataset import read_or_create_dataset -from SourceCodeTools.models.graph import RGCNSampling, RGAN, RGGAN +from SourceCodeTools.code.data.dataset.Dataset import read_or_create_gnn_dataset +from SourceCodeTools.models.graph import RGGAN from SourceCodeTools.models.graph.train.utils import get_name, get_model_base -from params import rgcnsampling_params, rggan_params +from SourceCodeTools.models.training_options import add_gnn_train_args + + +def detect_checkpoint_files(path): + checkpoints = [] + for file in os.listdir(path): + filepath = os.path.join(path, file) + if not os.path.isfile: + continue + + if not file.startswith("saved_state_"): + continue + + epoch = int(file.split("_")[2].split(".")[0]) + checkpoints.append((epoch, filepath)) + + checckpoints = sorted(checkpoints, key=lambda x: x[0]) + + return checckpoints def main(models, args): for model, param_grid in models.items(): for params in param_grid: + if args.h_dim is None: + params["h_dim"] = args.node_emb_size + else: + params["h_dim"] = args.h_dim + date_time = str(datetime.now()) print("\n\n") print(date_time) @@ -24,29 +48,26 @@ def main(models, args): model_base = get_model_base(args, model_attempt) - dataset = read_or_create_dataset(args=args, model_base=model_base) + dataset = read_or_create_gnn_dataset(args=args, model_base=model_base) + + from SourceCodeTools.models.graph.train.sampling_multitask2 import evaluation_procedure - if args.training_mode == "multitask": + checkpoints = detect_checkpoint_files(model_base) - if args.intermediate_supervision: - # params['use_self_loop'] = True # ???? 
- from SourceCodeTools.models.graph.train.sampling_multitask_intermediate_supervision import evaluation_procedure - else: - from SourceCodeTools.models.graph.train.sampling_multitask2 import evaluation_procedure + for epoch, ckpt_path in checkpoints: + shutil.copy(ckpt_path, os.path.join(model_base, "saved_state.pt")) evaluation_procedure(dataset, model, params, args, model_base) - else: - raise ValueError("Issue! ", args.training_mode) if __name__ == "__main__": import argparse - from train import add_train_args + # from train import add_train_args parser = argparse.ArgumentParser(description='Process some integers.') - add_train_args(parser) + add_gnn_train_args(parser) args = parser.parse_args() @@ -54,11 +75,17 @@ def main(models, args): model_out = args.model_output_dir - saved_args = json.loads(open(os.path.join(model_out, "metadata.json")).read()) - - models_ = { - eval(saved_args.pop("name").split("-")[0]): [saved_args.pop("parameters")] - } + try: + saved_args = json.loads(open(os.path.join(model_out, "metadata.json")).read()) + models_ = { + eval(saved_args.pop("name").split("-")[0]): [saved_args.pop("parameters")] + } + except: + param_keys = 'activation', 'use_self_loop', 'num_steps', 'dropout', 'num_bases', 'lr' + saved_args = json.loads(open(os.path.join(model_out, "params.json")).read()) + models_ = { + RGGAN: [{key: saved_args[key] for key in param_keys}] + } args.__dict__.update(saved_args) args.restore_state = True diff --git a/scripts/training/evaluate_only.py b/scripts/training/evaluate_only.py new file mode 100644 index 00000000..1a122480 --- /dev/null +++ b/scripts/training/evaluate_only.py @@ -0,0 +1,105 @@ +import json +import logging +from copy import copy +from datetime import datetime +from os import mkdir +from os.path import isdir, join + +from SourceCodeTools.code.data.dataset.Dataset import read_or_create_gnn_dataset +from SourceCodeTools.models.graph import RGGAN +from SourceCodeTools.models.graph.train.utils import get_name, 
get_model_base +from SourceCodeTools.models.training_options import add_gnn_train_args, verify_arguments +from params import rggan_params + + +def main(models, args): + for model, param_grid in models.items(): + for params in param_grid: + + if args.h_dim is None: + params["h_dim"] = args.node_emb_size + else: + params["h_dim"] = args.h_dim + + params["num_steps"] = args.n_layers + + date_time = str(datetime.now()) + print("\n\n") + print(date_time) + print(f"Model: {model.__name__}, Params: {params}") + + model_attempt = get_name(model, date_time) + + model_base = get_model_base(args, model_attempt) + + dataset = read_or_create_gnn_dataset(args=args, model_base=model_base) + + def write_params(args, params): + args = copy(args.__dict__) + args.update(params) + args['activation'] = args['activation'].__name__ + with open(join(model_base, "params.json"), "w") as mdata: + mdata.write(json.dumps(args, indent=4)) + + if not args.restore_state: + write_params(args, params) + + from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure + + trainer, scores = \ + training_procedure(dataset, model, copy(params), args, model_base) + + trainer.save_checkpoint(model_base) + + print("Saving...") + + params['activation'] = params['activation'].__name__ + + metadata = { + "base": model_base, + "name": model_attempt, + "parameters": params, + "layers": "embeddings.pkl", + "mappings": "nodes.csv", + "state": "state_dict.pt", + "scores": scores, + "time": date_time, + } + + metadata.update(args.__dict__) + + # pickle.dump(dataset, open(join(model_base, "dataset.pkl"), "wb")) + import pickle + pickle.dump(trainer.get_embeddings(), open(join(model_base, metadata['layers']), "wb")) + + with open(join(model_base, "metadata.json"), "w") as mdata: + mdata.write(json.dumps(metadata, indent=4)) + + print("Done saving") + + +if __name__ == "__main__": + + import argparse + + parser = argparse.ArgumentParser(description='') + add_gnn_train_args(parser) + + args = 
parser.parse_args() + verify_arguments(args) + + logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(module)s:%(lineno)d:%(message)s") + + models_ = { + # GCNSampling: gcnsampling_params, + # GATSampler: gatsampling_params, + # RGCNSampling: rgcnsampling_params, + # RGAN: rgcnsampling_params, + RGGAN: rggan_params + + } + + if not isdir(args.model_output_dir): + mkdir(args.model_output_dir) + + main(models_, args) diff --git a/scripts/training/params.py b/scripts/training/params.py index 85444fce..0a5a350c 100644 --- a/scripts/training/params.py +++ b/scripts/training/params.py @@ -68,12 +68,12 @@ rggan_grids = [ { - 'h_dim': [100], - 'num_bases': [-1], - 'num_steps': [5], - 'dropout': [0.0], - 'use_self_loop': [False], - 'activation': [torch.nn.functional.leaky_relu], # torch.nn.functional.hardswish], #[torch.nn.functional.hardtanh], #torch.nn.functional.leaky_relu + 'h_dim': [100], # set from cli + 'num_bases': [10], + 'num_steps': [9], # set from cli + 'dropout': [0.2], + 'use_self_loop': [True], + 'activation': [torch.tanh], # torch.nn.functional.hardswish], #[torch.nn.functional.hardtanh], #torch.nn.functional.leaky_relu 'lr': [1e-3], # 1e-4] } ] diff --git a/scripts/training/substitute_model.py b/scripts/training/substitute_model.py new file mode 100644 index 00000000..001d8898 --- /dev/null +++ b/scripts/training/substitute_model.py @@ -0,0 +1,122 @@ +import json +import logging +from copy import copy +from datetime import datetime +from os import mkdir +from os.path import isdir, join + +from SourceCodeTools.code.data.dataset.Dataset import read_or_create_gnn_dataset +from SourceCodeTools.models.graph import RGGAN +from SourceCodeTools.models.graph.train.utils import get_name, get_model_base +from SourceCodeTools.models.training_options import add_gnn_train_args, verify_arguments +from params import rggan_params + + +def main(models, args): + for model, param_grid in models.items(): + for params in param_grid: + + if args.h_dim is 
None: + params["h_dim"] = args.node_emb_size + else: + params["h_dim"] = args.h_dim + + params["num_steps"] = args.n_layers + + date_time = str(datetime.now()) + print("\n\n") + print(date_time) + print(f"Model: {model.__name__}, Params: {params}") + + model_attempt = get_name(model, date_time) + + model_base = get_model_base(args, model_attempt) + + dataset = read_or_create_gnn_dataset(args=args, model_base=model_base) + + if args.external_dataset is not None: + external_args = copy(args) + external_args.data_path = external_args.external_dataset + external_args.external_model_base = get_model_base(external_args, model_attempt, force_new=True) + def load_external_dataset(): + return external_args, read_or_create_gnn_dataset(args=external_args, + model_base=external_args.external_model_base, + force_new=True) + else: + load_external_dataset = None + + def write_params(args, params): + args = copy(args.__dict__) + args.update(params) + args['activation'] = args['activation'].__name__ + with open(join(model_base, "params.json"), "w") as mdata: + mdata.write(json.dumps(args, indent=4)) + + if not args.restore_state: + write_params(args, params) + + from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure + + trainer, scores = \ + training_procedure(dataset, model, copy(params), args, model_base, load_external_dataset=load_external_dataset) + + if load_external_dataset is not None: + model_base = external_args.external_model_base + + trainer.save_checkpoint(model_base) + + print("Saving...") + + params['activation'] = params['activation'].__name__ + + metadata = { + "base": model_base, + "name": model_attempt, + "parameters": params, + "layers": "embeddings.pkl", + "mappings": "nodes.csv", + "state": "state_dict.pt", + "scores": scores, + "time": date_time, + } + + metadata.update(args.__dict__) + + # pickle.dump(dataset, open(join(model_base, "dataset.pkl"), "wb")) + import pickle + pickle.dump(trainer.get_embeddings(), 
open(join(model_base, metadata['layers']), "wb")) + + with open(join(model_base, "metadata.json"), "w") as mdata: + mdata.write(json.dumps(metadata, indent=4)) + + print("Done saving") + + + + + +if __name__ == "__main__": + + import argparse + + parser = argparse.ArgumentParser(description='') + add_gnn_train_args(parser) + + args = parser.parse_args() + verify_arguments(args) + + logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(module)s:%(lineno)d:%(message)s") + + models_ = { + # GCNSampling: gcnsampling_params, + # GATSampler: gatsampling_params, + # RGCNSampling: rgcnsampling_params, + # RGAN: rgcnsampling_params, + RGGAN: rggan_params + + } + + if not isdir(args.model_output_dir): + mkdir(args.model_output_dir) + + main(models_, args) diff --git a/scripts/training/train.py b/scripts/training/train.py index 840fa42f..227a3752 100644 --- a/scripts/training/train.py +++ b/scripts/training/train.py @@ -5,13 +5,17 @@ from os import mkdir from os.path import isdir, join -from SourceCodeTools.code.data.sourcetrail.Dataset import read_or_create_dataset -from SourceCodeTools.models.graph import RGCNSampling, RGAN, RGGAN +from SourceCodeTools.code.data.dataset.Dataset import read_or_create_gnn_dataset +from SourceCodeTools.models.graph import RGGAN +from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure from SourceCodeTools.models.graph.train.utils import get_name, get_model_base -from params import rgcnsampling_params, rggan_params +from SourceCodeTools.models.training_config import get_config, load_config, update_config, save_config +from SourceCodeTools.models.training_options import add_gnn_train_args, verify_arguments +from params import rggan_params -def main(models, args): +def train_grid(models, args): + for model, param_grid in models.items(): for params in param_grid: @@ -31,7 +35,7 @@ def main(models, args): model_base = get_model_base(args, model_attempt) - dataset = read_or_create_dataset(args=args, 
model_base=model_base) + dataset = read_or_create_gnn_dataset(args=args, model_base=model_base) def write_params(args, params): args = copy(args.__dict__) @@ -40,24 +44,17 @@ def write_params(args, params): with open(join(model_base, "params.json"), "w") as mdata: mdata.write(json.dumps(args, indent=4)) - write_params(args, params) - - if args.training_mode == "multitask": + if not args.restore_state: + write_params(args, params) - # if args.intermediate_supervision: - # # params['use_self_loop'] = True # ???? - # from SourceCodeTools.models.graph.train.sampling_multitask_intermediate_supervision import training_procedure - # else: - from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure + from SourceCodeTools.models.graph.train.sampling_multitask2 import training_procedure - trainer, scores = \ - training_procedure(dataset, model, copy(params), args, model_base) + trainer, scores = \ + training_procedure(dataset, model, copy(params), args, model_base) - trainer.save_checkpoint(model_base) - else: - raise ValueError("Unknown training mode:", args.training_mode) + trainer.save_checkpoint(model_base) - print("Saving...", end="") + print("Saving...") params['activation'] = params['activation'].__name__ @@ -81,107 +78,98 @@ def write_params(args, params): with open(join(model_base, "metadata.json"), "w") as mdata: mdata.write(json.dumps(metadata, indent=4)) - print("done") - + print("Done saving") -def add_data_arguments(parser): - parser.add_argument('--data_path', '-d', dest='data_path', default=None, help='Path to the files') - parser.add_argument('--train_frac', dest='train_frac', default=0.9, type=float, help='') - parser.add_argument('--filter_edges', dest='filter_edges', default=None, help='Edges filtered before training') - parser.add_argument('--min_count_for_objectives', dest='min_count_for_objectives', default=5, type=int, help='') - parser.add_argument('--packages_file', dest='packages_file', default=None, type=str, help='') - 
parser.add_argument('--self_loops', action='store_true') - parser.add_argument('--use_node_types', action='store_true') - parser.add_argument('--use_edge_types', action='store_true') - parser.add_argument('--restore_state', action='store_true') - parser.add_argument('--no_global_edges', action='store_true') - parser.add_argument('--remove_reverse', action='store_true') +def main(args): -def add_pretraining_arguments(parser): - parser.add_argument('--pretrained', '-p', dest='pretrained', default=None, help='') - parser.add_argument('--tokenizer', '-t', dest='tokenizer', default=None, help='') - parser.add_argument('--pretraining_phase', dest='pretraining_phase', default=0, type=int, help='') + args = copy(args.__dict__) + config_path = args.pop("config") + if config_path is None: + config = get_config(**args) + else: + config = load_config(config_path) -def add_training_arguments(parser): - parser.add_argument('--embedding_table_size', dest='embedding_table_size', default=200000, type=int, help='Batch size') - parser.add_argument('--random_seed', dest='random_seed', default=None, type=int, help='') + model = RGGAN - parser.add_argument('--node_emb_size', dest='node_emb_size', default=100, type=int, help='') - parser.add_argument('--elem_emb_size', dest='elem_emb_size', default=100, type=int, help='') - parser.add_argument('--num_per_neigh', dest='num_per_neigh', default=10, type=int, help='') - parser.add_argument('--neg_sampling_factor', dest='neg_sampling_factor', default=3, type=int, help='') + date_time = str(datetime.now()) + print("\n\n") + print(date_time) - parser.add_argument('--use_layer_scheduling', action='store_true') - parser.add_argument('--schedule_layers_every', dest='schedule_layers_every', default=10, type=int, help='') + restore_state = config["TRAINING"]["restore_state"] - parser.add_argument('--epochs', dest='epochs', default=100, type=int, help='Number of epochs') - parser.add_argument('--batch_size', dest='batch_size', default=128, type=int, 
help='Batch size') + model_attempt = get_name(model, date_time) - parser.add_argument("--h_dim", dest="h_dim", default=None, type=int) - parser.add_argument("--n_layers", dest="n_layers", default=5, type=int) - parser.add_argument("--objectives", dest="objectives", default=None, type=str) - - -def add_scoring_arguments(parser): - parser.add_argument('--measure_ndcg', action='store_true') - parser.add_argument('--dilate_ndcg', dest='dilate_ndcg', default=200, type=int, help='') + model_base = get_model_base(config["TRAINING"], model_attempt) + dataset = read_or_create_gnn_dataset( + args={**config["DATASET"], **config["TOKENIZER"]}, + model_base=model_base, restore_state=restore_state + ) -def add_performance_arguments(parser): - parser.add_argument('--no_checkpoints', dest="save_checkpoints", action='store_false') + if not restore_state: + save_config(config, join(model_base, "config.yaml")) - parser.add_argument('--use_gcn_checkpoint', action='store_true') - parser.add_argument('--use_att_checkpoint', action='store_true') - parser.add_argument('--use_gru_checkpoint', action='store_true') + trainer, scores = training_procedure( + dataset, + model_name=model, + model_params=config["MODEL"], + trainer_params=config["TRAINING"], + tokenizer_path=config["TOKENIZER"]["tokenizer_path"], + model_base_path=model_base + ) + trainer.save_checkpoint(model_base) -def add_train_args(parser): - parser.add_argument( - '--training_mode', '-tr', dest='training_mode', default=None, - help='Selects one of training procedures [multitask]' - ) + print("Saving...") - add_data_arguments(parser) - add_pretraining_arguments(parser) - add_training_arguments(parser) - add_scoring_arguments(parser) - add_performance_arguments(parser) + metadata = { + "base": model_base, + "name": model_attempt, + "layers": "embeddings.pkl", + "mappings": "nodes.csv", + "state": "state_dict.pt", + "scores": scores, + "time": date_time, + } - parser.add_argument('--note', dest='note', default="", help='Note, 
added to metadata') - parser.add_argument('model_output_dir', help='Location of the final model') + metadata["config"] = args - # parser.add_argument('--intermediate_supervision', action='store_true') - parser.add_argument('--gpu', dest='gpu', default=-1, type=int, help='') + # pickle.dump(dataset, open(join(model_base, "dataset.pkl"), "wb")) + import pickle + pickle.dump(trainer.get_embeddings(), open(join(model_base, metadata['layers']), "wb")) + with open(join(model_base, "metadata.json"), "w") as mdata: + mdata.write(json.dumps(metadata, indent=4)) -def verify_arguments(args): - pass + print("Done saving") if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description='Process some integers.') - add_train_args(parser) + parser = argparse.ArgumentParser(description='') + add_gnn_train_args(parser) args = parser.parse_args() verify_arguments(args) logging.basicConfig(level=logging.INFO, format="%(asctime)s:%(levelname)s:%(module)s:%(lineno)d:%(message)s") - models_ = { - # GCNSampling: gcnsampling_params, - # GATSampler: gatsampling_params, - # RGCNSampling: rgcnsampling_params, - # RGAN: rgcnsampling_params, - RGGAN: rggan_params - - } + # models_ = { + # # GCNSampling: gcnsampling_params, + # # GATSampler: gatsampling_params, + # # RGCNSampling: rgcnsampling_params, + # # RGAN: rgcnsampling_params, + # RGGAN: rggan_params + # + # } if not isdir(args.model_output_dir): mkdir(args.model_output_dir) - main(models_, args) + main(args) + + # main(models_, args) diff --git a/scripts/training/train_all_emb_types.sh b/scripts/training/train_all_emb_types.sh index 4ddd0545..1e2dd656 100755 --- a/scripts/training/train_all_emb_types.sh +++ b/scripts/training/train_all_emb_types.sh @@ -1,7 +1,16 @@ +DATASET=$1 +DGLKE_OUT=$2 + +#conda activate SourceCodeTools +#prepare_dglke_format.py $DATASET $DGLKE_OUT +#conda deactivate + conda activate dglke -dglke_train --model_name TransR --dataset code --data_path . 
--data_files edges_train_dglke.tsv held_dglkg.tsv held_dglkg.tsv --format raw_udd_htr --save_path transr_u --hidden_dim 100 --num_thread 4 --gpu 0 -dglke_train --model_name RESCAL --dataset code --data_path . --data_files edges_train_dglke.tsv held_dglkg.tsv held_dglkg.tsv --format raw_udd_htr --save_path RESCAL_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 160000 -dglke_train --model_name DistMult --dataset code --data_path . --data_files edges_train_dglke.tsv held_dglkg.tsv held_dglkg.tsv --format raw_udd_htr --save_path DistMult_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 400000 -dglke_train --model_name ComplEx --dataset code --data_path . --data_files edges_train_dglke.tsv held_dglkg.tsv held_dglkg.tsv --format raw_udd_htr --save_path ComplEx_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 400000 -dglke_train --model_name RotatE --dataset code --data_path . --data_files edges_train_dglke.tsv held_dglkg.tsv held_dglkg.tsv --format raw_udd_htr --save_path RotatE_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 200000 -de + +dglke_train --model_name TransR --dataset code --data_path $DGLKE_OUT --data_files edges_train_dglke.tsv edges_eval_dglke_10000.tsv edges_eval_dglke_10000.tsv --format raw_udd_htr --save_path $DGLKE_OUT/transr_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 1000000 -adv --test +dglke_train --model_name RESCAL --dataset code --data_path $DGLKE_OUT --data_files edges_train_dglke.tsv edges_eval_dglke_10000.tsv edges_eval_dglke_10000.tsv --format raw_udd_htr --save_path $DGLKE_OUT/RESCAL_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 1000000 -adv --test +dglke_train --model_name DistMult --dataset code --data_path $DGLKE_OUT --data_files edges_train_dglke.tsv edges_eval_dglke_10000.tsv edges_eval_dglke_10000.tsv --format raw_udd_htr --save_path $DGLKE_OUT/DistMult_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 1000000 -adv --test +dglke_train --model_name ComplEx --dataset code --data_path $DGLKE_OUT 
--data_files edges_train_dglke.tsv edges_eval_dglke_10000.tsv edges_eval_dglke_10000.tsv --format raw_udd_htr --save_path $DGLKE_OUT/ComplEx_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 1000000 -adv --test +dglke_train --model_name RotatE --dataset code --data_path $DGLKE_OUT --data_files edges_train_dglke.tsv edges_eval_dglke_10000.tsv edges_eval_dglke_10000.tsv --format raw_udd_htr --save_path $DGLKE_OUT/RotatE_u --hidden_dim 100 --num_thread 4 --gpu 0 --max_step 1000000 -de -adv --test + conda deactivate diff --git a/setup.py b/setup.py index 7262d3af..19743dfd 100644 --- a/setup.py +++ b/setup.py @@ -1,51 +1,55 @@ from distutils.core import setup -requitements = [ - 'nltk', - 'tensorflow>=2.4.0', - 'torch>=1.7.1', - 'pandas>=1.1.1', - 'sklearn', - 'sentencepiece', - 'gensim', - 'numpy>=1.19.2', - 'scipy', - 'networkx', - 'sacrebleu', - 'datasets', - 'spacy==2.3.2', - 'pytest' - # 'pygraphviz' - # 'javac_parser' +requirements = [ + 'nltk==3.6', + 'tensorflow==2.6.2', + 'torch==1.9.0', + 'pandas==1.1.1', + 'scikit-learn==1.0', + 'sentencepiece==0.1.96', + 'gensim==3.8', + 'numpy==1.19.5', + 'scipy==1.4.1', + 'networkx==2.5', + 'sacrebleu==1.5.1', + 'datasets==1.5.0', + 'spacy==2.3.2', + 'pytest==6.1.2', + 'faiss-cpu==1.7.0', + 'tqdm==4.49.0' + # 'pygraphviz' + # 'javac_parser' ] - +# conda install pytorch cudatoolkit=11.1 dgl-cuda11.1 -c dglteam -c pytorch -c nvidia setup(name='SourceCodeTools', - version='0.0.2', + version='0.0.3', py_modules=['SourceCodeTools'], - install_requires=requitements + ["dgl"], - extras_require={ - "gpu": requitements + ["dgl-cu110"] - }, + # install_requires=requirements + ["dgl==0.7.1"], + # extras_require={ + # "gpu": requirements + ["dgl-cu111==0.7.1"] + # }, + dependency_links=['https://data.dgl.ai/wheels/repo.html'], scripts=[ - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_call_seq_extractor.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_node_names.py', - 
'SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_variable_names.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_ambiguous_edges.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_compute_function_diameter.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_add_reverse_edges.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges2.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_merge_graphs.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_parse_bodies2.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_edges_name_resolve.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_node_name_merge.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_decode_edge_types.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_verify_files.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns_only_annotations.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_node_local2global.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_connected_component.py', - 'SourceCodeTools/code/data/sourcetrail/pandas_format_converter.py', - 'SourceCodeTools/code/data/sourcetrail/sourcetrail_create_type_annotation_dataset.py', - 'SourceCodeTools/nlp/embed/converters/convert_fasttext_format_bin_to_vec.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_call_seq_extractor.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_node_names.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_extract_variable_names.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_filter_ambiguous_edges.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_compute_function_diameter.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_add_reverse_edges.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_ast_edges2.py', + 
'SourceCodeTools/code/data/sourcetrail/sourcetrail_merge_graphs.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_parse_bodies2.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_edges_name_resolve.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_node_name_merge.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_decode_edge_types.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_verify_files.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_map_id_columns_only_annotations.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_node_local2global.py', + 'SourceCodeTools/code/data/sourcetrail/sourcetrail_connected_component.py', + 'SourceCodeTools/code/data/sourcetrail/pandas_format_converter.py', + # 'SourceCodeTools/code/data/sourcetrail/sourcetrail_create_type_annotation_dataset.py', + 'SourceCodeTools/nlp/embed/converters/convert_fasttext_format_bin_to_vec.py', + 'SourceCodeTools/models/graph/utils/prepare_dglke_format.py', ], -) + ) diff --git a/var_misuse_tiny.yaml b/var_misuse_tiny.yaml new file mode 100644 index 00000000..360486d0 --- /dev/null +++ b/var_misuse_tiny.yaml @@ -0,0 +1,55 @@ +DATASET: + custom_reverse: null + data_path: ./examples/one_vs_10/with_ast + filter_edges: null + min_count_for_objectives: 5 + no_global_edges: false + random_seed: null + remove_reverse: false + restricted_id_pool: null + self_loops: false + subgraph_id_column: file_id + subgraph_partition: ./examples/one_vs_10/with_ast/subgraph_partition.json.bz2 + train_frac: 0.9 + use_edge_types: false + use_node_types: false +MODEL: + activation: tanh + dropout: 0.0 + h_dim: 100 + n_layers: 5 + node_emb_size: 100 + num_bases: 10 + use_att_checkpoint: false + use_gcn_checkpoint: false + use_gru_checkpoint: false + use_self_loop: true +TOKENIZER: + tokenizer_path: sentencepiece_bpe.model +TRAINING: + batch_size: 128 + dilate_scores: 1 + early_stopping: false + 
early_stopping_tolerance: 20 + elem_emb_size: 100 + embedding_table_size: 200000 + epochs: 100 + external_dataset: null + force_w2v_ns: false + gpu: -1 + learning_rate: 0.001 + measure_scores: true + metric: inner_prod + model_output_dir: ./examples/one_vs_10/with_ast + neg_sampling_factor: 3 + nn_index: brute + objectives: subgraph_clf + pretrained: null + pretraining_phase: 0 + restore_state: false + sampling_neighbourhood_size: 10 + save_checkpoints: true + save_each_epoch: false + schedule_layers_every: 10 + use_layer_scheduling: false + use_ns_groups: false