jiant-dev · HaokunLiu · Jun 23, 2020 · Jun 23, 2020 · Jun 23, 2020 · Jun 23, 2020
diff --git a/documentation/porting_examples/example5.md b/documentation/porting_examples/example5.md
@@ -0,0 +1,121 @@
+# Transfer example
+
+```bash
+# Set these two paths for your environment; the directories below are all derived from WORKING_DIR
+WORKING_DIR=...    # Choose a working dir (better in scratch)
+NYU_JIANT_DIR=...  # Where you downloaded https://github.com/jiant-dev/nyu-jiant
+
+MODELS_DIR=${WORKING_DIR}/models  # exported model files (config.json, tokenizer)
+DATA_DIR=${WORKING_DIR}/data  # raw task data (data/) and task configs (configs/)
+CACHE_DIR=${WORKING_DIR}/cache  # tokenized task caches
+RUN_CONFIG_DIR=${WORKING_DIR}/run_config_dir  # per-task run configs
+OUTPUT_DIR=${WORKING_DIR}/output_dir  # training outputs, one subdirectory per run
+MODEL_TYPE=roberta-large
+EXP_NAME=transfer_example  # subdirectory used under RUN_CONFIG_DIR and OUTPUT_DIR
+
+# Download model
+python jiant/scripts/preproc/export_model.py \
+    --model_type ${MODEL_TYPE} \
+    --output_base_path ${MODELS_DIR}/${MODEL_TYPE}
+
+# Move data into location; mkdir -p first because cp cannot create the missing parent ${DATA_DIR}
+# Ping Haokun on Slack if you have access issues
+mkdir -p ${DATA_DIR} && cp -r /scratch/hl3232/shared/transfer_pilot_data ${DATA_DIR}/data
+
+
+# Prepare data configs: writes ${DATA_DIR}/configs/<task>.json with paths pointing into ${DATA_DIR}/data
+python ${NYU_JIANT_DIR}/documentation/porting_examples/example5_assets/write_data_configs.py \
+    --output_base_path ${DATA_DIR}/
+
+
+# Tokenize and cache datasets
+for TASK_NAME in mnli ccg squadv1 cosmosqa rte cola boolq wic
+do
+    python jiant/proj/simple/tokenize_and_cache.py \
+        --task_config_path ${DATA_DIR}/configs/${TASK_NAME}.json \
+        --model_type ${MODEL_TYPE} \
+        --model_tokenizer_path ${MODELS_DIR}/${MODEL_TYPE}/tokenizer \
+        --phases train,val \
+        --max_seq_length 256 \
+        --do_iter \
+        --smart_truncate \
+        --output_dir ${CACHE_DIR}/${MODEL_TYPE}/${TASK_NAME}
+done
+
+# Generate run configs (make_config.py is reused from example4_assets); GPUS is only read later, to pick the sbatch script
+declare -A TASK_EPOCHS=(
+  ["mnli"]=3
+  ["ccg"]=3
+  ["squadv1"]=3
+  ["cosmosqa"]=3
+  ["rte"]=20
+  ["cola"]=20
+  ["boolq"]=20
+  ["wic"]=20
+)
+declare -A GPUS=(
+  ["mnli"]=1
+  ["ccg"]=1
+  ["squadv1"]=1
+  ["cosmosqa"]=4
+  ["rte"]=1
+  ["cola"]=1
+  ["boolq"]=1
+  ["wic"]=1
+)
+for TASK_NAME in mnli ccg squadv1 cosmosqa rte cola boolq wic
+do
+    python ${NYU_JIANT_DIR}/documentation/porting_examples/example4_assets/make_config.py \
+        --task_config_path ${DATA_DIR}/configs/${TASK_NAME}.json \
+        --task_cache_base_path ${CACHE_DIR}/${MODEL_TYPE}/${TASK_NAME} \
+        --train_batch_size 16 \
+        --epochs ${TASK_EPOCHS[${TASK_NAME}]} \
+        --output_path ${RUN_CONFIG_DIR}/${EXP_NAME}/${TASK_NAME}.json
+done
+
+# Train single task: submits one Slurm job per task via ~/j2_g<N>.sbatch, which runs the command passed in COMMAND
+for TASK_NAME in mnli ccg squadv1 cosmosqa rte cola boolq wic
+do
+    COMMAND="python \
+        jiant/proj/main/runscript.py \
+        run \
+        --ZZsrc ${MODELS_DIR}/${MODEL_TYPE}/config.json \
+        --jiant_task_container_config_path ${RUN_CONFIG_DIR}/${EXP_NAME}/${TASK_NAME}.json \
+        --model_load_mode from_transformers \
+        --learning_rate 1e-5 \
+        --force_overwrite \
+        --do_train --do_val \
+        --do_save \
+        --eval_every_steps 2000 \
+        --no_improvements_for_n_evals 30 \
+        --save_checkpoint_every_steps 10000 \
+        --output_dir ${OUTPUT_DIR}/${EXP_NAME}/${TASK_NAME}/" sbatch ~/j2_g${GPUS[${TASK_NAME}]}.sbatch
+done
+
+# Train target task from source task
+for SOURCE_TASK in mnli ccg squadv1 cosmosqa
+do
+    for TARGET_TASK in rte cola boolq wic
+    do
+        COMMAND="python \
+            jiant/proj/main/runscript.py \
+            run \
+            --ZZoverrides model_path \
+            --ZZsrc ${MODELS_DIR}/${MODEL_TYPE}/config.json \
+            --jiant_task_container_config_path ${RUN_CONFIG_DIR}/${EXP_NAME}/${TARGET_TASK}.json \
+            --model_load_mode partial \
+            --model_path ${OUTPUT_DIR}/${EXP_NAME}/${SOURCE_TASK}/best_model.p \
+            --learning_rate 1e-5 \
+            --force_overwrite \
+            --do_train --do_val \
+            --do_save \
+            --eval_every_steps 2000 \
+            --no_improvements_for_n_evals 30 \
+            --save_checkpoint_every_steps 10000 \
+            --output_dir ${OUTPUT_DIR}/${EXP_NAME}/${SOURCE_TASK}__${TARGET_TASK}/" sbatch ~/j2_g${GPUS[${TARGET_TASK}]}.sbatch
+    done
+done
+
+
+grep major ${OUTPUT_DIR}/${EXP_NAME}/*/val_metrics.json
+```
diff --git a/documentation/porting_examples/example5_assets/j2_g1.md b/documentation/porting_examples/example5_assets/j2_g1.md
@@ -0,0 +1,34 @@
+# An example of j2_g1.sbatch
+
+- Replace `[your net id]` with your own NetID.
+- Create copies for other GPU counts (e.g. `j2_g4.sbatch`) by changing the final number in `--gres=gpu:p40:1`.
+- Move these files to `~`.
+- If you are working from Windows, run `dos2unix ~/*.sbatch`.
+
+```bash
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=7
+#SBATCH --gres=gpu:p40:1
+#SBATCH --time=168:00:00
+#SBATCH --mem=30000
+#SBATCH --job-name=p40
+#SBATCH --mail-type=END
+#SBATCH --mail-user=[your net id]@nyu.edu
+#SBATCH --output=/scratch/[your net id]/slurm/%j.out
+module purge
+module load anaconda3/5.3.1
+source activate jiant2
+export PATH=/home/[your net id]/jiant-dev:$PATH # Where you downloaded https://github.com/jiant-dev/jiant
+export PYTHONPATH=/home/[your net id]/jiant-dev:$PYTHONPATH # Where you downloaded https://github.com/jiant-dev/jiant
+export WORKING_DIR=/scratch/[your net id]/j2_files  # Choose a working dir
+export NYU_JIANT_DIR=/home/[your net id]/nyu-jiant  # Where you downloaded https://github.com/jiant-dev/nyu-jiant
+export MODELS_DIR=${WORKING_DIR}/models
+export DATA_DIR=${WORKING_DIR}/data
+export CACHE_DIR=${WORKING_DIR}/cache
+export RUN_CONFIG_DIR=${WORKING_DIR}/run_config_dir/
+export OUTPUT_DIR=${WORKING_DIR}/output_dir/
+echo ${COMMAND}  # log the submitted command in the job's output file
+${COMMAND}  # run the command that was passed in the COMMAND environment variable at submission
+```
diff --git a/documentation/porting_examples/example5_assets/write_data_configs.py b/documentation/porting_examples/example5_assets/write_data_configs.py
@@ -0,0 +1,103 @@
+import os
+import jiant.utils.python.io as py_io
+import jiant.utils.zconf as zconf
+
+
+def write_data_configs(output_base_path):
+    all_configs = {
+        "mnli": {
+            "task": "mnli",
+            "paths": {
+                "train": "mnli/train.jsonl",
+                "val": "mnli/val.jsonl",
+                "test": "mnli/test.jsonl",
+            },
+            "name": "mnli",
+        },
+        "ccg": {
+            "task": "ccg",
+            "paths": {
+                "train": "ccg/ccg.train",
+                "val": "ccg/ccg.dev",
+                "test": "ccg/ccg.test",
+                "tags_to_id": "ccg/tags_to_id.json",
+            },
+            "name": "ccg",
+        },
+        "squadv1": {
+            "task": "squadv1",
+            "paths": {
+                "train": "squadv1/train-v1.1.json",
+                "val": "squadv1/dev-v1.1.json",
+                "test": "squadv1/dev-v1.1.json",
+            },
+            "name": "squadv1",
+        },
+        "cosmosqa": {
+            "task": "cosmosqa",
+            "paths": {
+                "train": "cosmosqa/train.csv",
+                "val": "cosmosqa/valid.csv",
+                "test": "cosmosqa/test_no_label.csv",
+            },
+            "name": "cosmosqa",
+        },
+        "rte": {
+            "task": "rte",
+            "paths": {
+                "train": "rte/train.jsonl",
+                "val": "rte/val.jsonl",
+                "test": "rte/test.jsonl",
+            },
+            "name": "rte",
+        },
+        "cola": {
+            "task": "cola",
+            "paths": {
+                "train": "cola/train.jsonl",
+                "val": "cola/val.jsonl",
+                "test": "cola/test.jsonl",
+            },
+            "name": "cola",
+        },
+        "boolq": {
+            "task": "boolq",
+            "paths": {
+                "train": "boolq/train.jsonl",
+                "val": "boolq/val.jsonl",
+                "test": "boolq/test.jsonl",
+            },
+            "name": "boolq",
+        },
+        "wic": {
+            "task": "wic",
+            "paths": {
+                "train": "wic/train.jsonl",
+                "val": "wic/val.jsonl",
+                "test": "wic/test.jsonl",
+            },
+            "name": "wic",
+        },
+    }
+    for task_name, config in all_configs.items():
+        for split, data_path in config["paths"].items():
+            config["paths"][split] = os.path.join(output_base_path, "data", data_path)
+
+        py_io.write_json(
+            data=config,
+            path=os.path.join(output_base_path, "configs", f"{task_name}.json"),
+        )
+
+
+@zconf.run_config
+class RunConfiguration(zconf.RunConfig):  # command-line configuration of this script
+    output_base_path = zconf.attr(type=str)  # base directory holding data/ and receiving configs/
+
+
+def main():
+    args = RunConfiguration.default_run_cli()
+    write_data_configs(output_base_path=args.output_base_path)
+
+
+if __name__ == "__main__":
+    main()