From 601185a8709c11e9d1c088d1ff3c4b0912cc4724 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Mon, 28 Mar 2022 17:47:43 +0200
Subject: [PATCH 01/14] add template extrapolation

---
 train/tr7-alibi/tr7d-extrapolation-law.slurm | 183 +++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 train/tr7-alibi/tr7d-extrapolation-law.slurm

diff --git a/train/tr7-alibi/tr7d-extrapolation-law.slurm b/train/tr7-alibi/tr7d-extrapolation-law.slurm
new file mode 100644
index 00000000..69f402b5
--- /dev/null
+++ b/train/tr7-alibi/tr7d-extrapolation-law.slurm
@@ -0,0 +1,183 @@
+#!/bin/bash
+#SBATCH --job-name=350M-alibi-extrapolation
+#SBATCH --qos=qos_gpu-t3
+#SBATCH --nodes=16
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=40         # number of cores per tasks
+#SBATCH --hint=nomultithread         # we get physical cores not logical
+#SBATCH --gres=gpu:4                 # number of gpus
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/alibi/%x-%j.out
+#SBATCH --account=six@gpu
+
+set -x -e
+
+ROUND=2
+TESTING=0
+
+OUTPUT_PATH=$SCRATCH/synched_exps/tr7b-350M-alibi
+MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed
+
+VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
+MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
+DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
+
+source $six_ALL_CCFRWORK/start-prod
+export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
+export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
+export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
+export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+cd $MEGATRON_DEEPSPEED_REPO
+
+MASTER_ADDR=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print')
+MASTER_PORT=6000
+
+# adjust depending on the number of the nodes
+
+# XXX: edit me
+GPUS_PER_NODE=4
+NNODES=4
+PP_SIZE=2                                                 # NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=4                                                 # always fixed to the size of a single node
+DP_SIZE=$((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE))) # will get derived automatically by trainer
+
+MICRO_BATCH_SIZE=8
+GLOBAL_BATCH_SIZE=512
+TRAIN_ITER=73_242_187
+
+NLAYERS=24
+NHIDDEN=1024
+NHEADS=16
+FFN_HIDDEN_SIZE=4096
+SEQ_LEN=2048
+
+if [[ ${ROUND} == 1 ]]; then
+    EXIT_INTERVAL=100 SAVE_INTERVAL=10
+elif [[ ${ROUND} == 2 ]]; then
+    SAVE_INTERVAL=1500
+else
+    echo "invalid ROUND: $ROUND"
+fi
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr 3e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --lr-decay-samples 73_242_187 \
+    --lr-warmup-samples 183_105 \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    --no-train 1 \
+    "
+
+EXIT_OPTS=" \
+    --exit-duration-in-mins 1190 \
+    "
+
+for increment in {100..2000..100}; do
+    SEQ_LEN_2=$(($increment + $SEQ_LEN))
+    echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"
+
+    GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --ffn-hidden-size $FFN_HIDDEN_SIZE \
+    --seq-length $SEQ_LEN_2 \
+    --max-position-embeddings $SEQ_LEN_2 \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --rampup-batch-size 32 32 2_000_000 \
+    --train-samples $TRAIN_ITER \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --loss-scale 12 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --checkpoint-activations \
+    --position-embedding-type alibi \
+    $OPTIMIZER_ARGS \
+    $EXIT_OPTS \
+    "
+
+    OUTPUT_ARGS=" \
+    --log-interval 200 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+    --tensorboard-dir $OUTPUT_PATH/validation/tensorboard \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+
+    ZERO_STAGE=1
+
+    config_json="./ds_config.$SLURM_JOBID.json"
+
+    # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+    cat <<EOT >$config_json
+{
+"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+"train_batch_size": $GLOBAL_BATCH_SIZE,
+"gradient_clipping": 1.0,
+"zero_optimization": {
+    "stage": $ZERO_STAGE
+},
+"fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+},
+"steps_per_print": 2000,
+"wall_clock_breakdown": false
+}
+EOT
+
+    DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+
+    export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+
+    export CMD=" \
+    $(pwd)/pretrain_gpt.py \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $OUTPUT_ARGS \
+    --save $OUTPUT_PATH/checkpoints \
+    --load $OUTPUT_PATH/checkpoints \
+    --data-path $DATA_PATH \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+    $DEEPSPEED_ARGS \
+    "
+
+    # # clear old checkpoint as it'd mismatch while we sort things out
+    #     rm -rf $SAVE_CHECKPOINT_PATH
+
+    echo $CMD
+
+    # to debug - add echo (it exits and prints what it would have launched)
+    srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $OUTPUT_PATH/validation/logs/tr7b-350M-alibi-extrapolation.$SLURM_JOBID.out
+done

From 59f98677652039928a37355bb31fe00bd5f3e57a Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Mon, 28 Mar 2022 18:08:06 +0200
Subject: [PATCH 02/14] save changes

---
 train/tr7-alibi/tr7d-extrapolation-law.slurm | 68 ++++++++++----------
 1 file changed, 35 insertions(+), 33 deletions(-)

diff --git a/train/tr7-alibi/tr7d-extrapolation-law.slurm b/train/tr7-alibi/tr7d-extrapolation-law.slurm
index 69f402b5..b9475585 100644
--- a/train/tr7-alibi/tr7d-extrapolation-law.slurm
+++ b/train/tr7-alibi/tr7d-extrapolation-law.slurm
@@ -12,16 +12,23 @@
 
 set -x -e
 
-ROUND=2
-TESTING=0
 
-OUTPUT_PATH=$SCRATCH/synched_exps/tr7b-350M-alibi
-MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed
-
-VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
-MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
+# TODO: modify these for your training setup, just Ctrl-F replace <YOUR_TRAINING_NAME>
+DATA_OUTPUT_PATH=$SCRATCH/checkpoints/tr7d-1B3-alibi
+CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
+REPO_PATH=$DATA_OUTPUT_PATH/tr7d-1B3-alibi-logs
+TENSORBOARD_PATH=$REPO_PATH/tensorboard
+CODECARBON_PATH=$REPO_PATH/codecarbon
+LOGS_PATH=$REPO_PATH/logs
+VAL_LOGS_PATH=$REPO_PATH/val-logs
+MEGATRON_DEEPSPEED_REPO=$DATA_OUTPUT_PATH/code/Megatron-DeepSpeed
+
+# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile)
+VOCAB_FILE=$DATA_OUTPUT_PATH/data/gpt2-vocab.json
+MERGE_FILE=$DATA_OUTPUT_PATH/data/gpt2-merges.txt
 DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
 
+# defining the right environment variables
 source $six_ALL_CCFRWORK/start-prod
 export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
 export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
@@ -31,49 +38,42 @@ export HF_DATASETS_OFFLINE=1
 export TRANSFORMERS_OFFLINE=1
 cd $MEGATRON_DEEPSPEED_REPO
 
-MASTER_ADDR=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print')
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
 
-# adjust depending on the number of the nodes
-
-# XXX: edit me
+# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger
 GPUS_PER_NODE=4
-NNODES=4
-PP_SIZE=2                                                 # NLAYERS must be a multiple of PP_SIZE here
-TP_SIZE=4                                                 # always fixed to the size of a single node
-DP_SIZE=$((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE))) # will get derived automatically by trainer
+NNODES=16
+PP_SIZE=2 # pipeline-parallel degree; NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=1 # tensor-parallel degree; 1 here (the 350M template used 4, i.e. one whole node)
+DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # informational only: the trainer derives the data-parallel size itself
 
-MICRO_BATCH_SIZE=8
+MICRO_BATCH_SIZE=1
 GLOBAL_BATCH_SIZE=512
 TRAIN_ITER=73_242_187
 
 NLAYERS=24
-NHIDDEN=1024
+NHIDDEN=2048
 NHEADS=16
-FFN_HIDDEN_SIZE=4096
+FFN_HIDDEN_SIZE=8192
 SEQ_LEN=2048
 
-if [[ ${ROUND} == 1 ]]; then
-    EXIT_INTERVAL=100 SAVE_INTERVAL=10
-elif [[ ${ROUND} == 2 ]]; then
-    SAVE_INTERVAL=1500
-else
-    echo "invalid ROUND: $ROUND"
-fi
+SAVE_INTERVAL=1500
 
 OPTIMIZER_ARGS=" \
     --optimizer adam \
     --adam-beta1 0.9 \
     --adam-beta2 0.999 \
     --adam-eps 1e-8 \
-    --lr 3e-4 \
+    --lr 2e-4 \
     --min-lr 1e-5 \
     --lr-decay-style cosine \
     --lr-decay-samples 73_242_187 \
     --lr-warmup-samples 183_105 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
-    --no-train 1 \
+    --eval-only \
     "
 
 EXIT_OPTS=" \
@@ -111,19 +111,21 @@ for increment in {100..2000..100}; do
     --save-interval $SAVE_INTERVAL \
     --eval-interval 1000 \
     --eval-iters 100 \
-    --tensorboard-dir $OUTPUT_PATH/validation/tensorboard \
+    --tensorboard-dir $TENSORBOARD_PATH \
     --tensorboard-queue-size 5 \
     --log-timers-to-tensorboard \
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
     "
+    # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
+    # series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
 
     ZERO_STAGE=1
 
     config_json="./ds_config.$SLURM_JOBID.json"
 
     # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
-    cat <<EOT >$config_json
+    cat <<EOT > $config_json
 {
 "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
 "train_batch_size": $GLOBAL_BATCH_SIZE,
@@ -159,13 +161,13 @@ EOT
     "
 
     export CMD=" \
-    $(pwd)/pretrain_gpt.py \
+    `pwd`/pretrain_gpt.py \
     --tensor-model-parallel-size $TP_SIZE \
     --pipeline-model-parallel-size $PP_SIZE \
     $GPT_ARGS \
     $OUTPUT_ARGS \
-    --save $OUTPUT_PATH/checkpoints \
-    --load $OUTPUT_PATH/checkpoints \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
     --data-path $DATA_PATH \
     --data-impl mmap \
     --split 949,50,1 \
@@ -179,5 +181,5 @@ EOT
     echo $CMD
 
     # to debug - add echo (it exits and prints what it would have launched)
-    srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $OUTPUT_PATH/validation/logs/tr7b-350M-alibi-extrapolation.$SLURM_JOBID.out
+    srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation.$SLURM_JOBID.out
 done

From 260a601ab69ea7793ccb98bb5b9fd0c47a5fcc4d Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Mon, 28 Mar 2022 18:18:43 +0200
Subject: [PATCH 03/14] woups

---
 train/tr7-alibi/tr7d-extrapolation-law.slurm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train/tr7-alibi/tr7d-extrapolation-law.slurm b/train/tr7-alibi/tr7d-extrapolation-law.slurm
index b9475585..3ba5b837 100644
--- a/train/tr7-alibi/tr7d-extrapolation-law.slurm
+++ b/train/tr7-alibi/tr7d-extrapolation-law.slurm
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=350M-alibi-extrapolation
+#SBATCH --job-name=1B3-alibi-extrapolation
 #SBATCH --qos=qos_gpu-t3
 #SBATCH --nodes=16
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
@@ -73,7 +73,7 @@ OPTIMIZER_ARGS=" \
     --lr-warmup-samples 183_105 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
-    --eval-only \
+    --eval-only 1\
     "
 
 EXIT_OPTS=" \

From d080ed1775ad0557c5b342ed31b351fde1d1b30e Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Tue, 29 Mar 2022 09:54:48 +0200
Subject: [PATCH 04/14] add tests

---
 .../tr7-alibi/tr7d-extrapolation-law-2.slurm  | 287 ++++++++++++++++++
 1 file changed, 287 insertions(+)
 create mode 100644 train/tr7-alibi/tr7d-extrapolation-law-2.slurm

diff --git a/train/tr7-alibi/tr7d-extrapolation-law-2.slurm b/train/tr7-alibi/tr7d-extrapolation-law-2.slurm
new file mode 100644
index 00000000..0a3da78a
--- /dev/null
+++ b/train/tr7-alibi/tr7d-extrapolation-law-2.slurm
@@ -0,0 +1,287 @@
+#!/bin/bash
+#SBATCH --job-name=1B3-alibi-extrapolation-2
+#SBATCH --qos=qos_gpu-t3
+#SBATCH --nodes=16
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=40         # number of cores per tasks
+#SBATCH --hint=nomultithread         # we get physical cores not logical
+#SBATCH --gres=gpu:4                 # number of gpus
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/alibi/%x-%j.out
+#SBATCH --account=six@gpu
+
+set -x -e -o pipefail # trace commands and abort on errors; pipefail keeps "| tee" from hiding a failed srun
+
+
+# Experiment paths, all rooted at DATA_OUTPUT_PATH; to target another run, replace "tr7d-1B3-alibi" below
+DATA_OUTPUT_PATH=$SCRATCH/checkpoints/tr7d-1B3-alibi
+CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints # passed to both --save and --load
+REPO_PATH=$DATA_OUTPUT_PATH/tr7d-1B3-alibi-logs
+TENSORBOARD_PATH=$REPO_PATH/tensorboard # passed to --tensorboard-dir
+CODECARBON_PATH=$REPO_PATH/codecarbon # only needed if --codecarbon-dir is enabled (see the TODO further down)
+LOGS_PATH=$REPO_PATH/logs # not referenced anywhere else in this script
+VAL_LOGS_PATH=$REPO_PATH/val-logs # srun output is appended here via tee; this script does not create the directory
+MEGATRON_DEEPSPEED_REPO=$DATA_OUTPUT_PATH/code/Megatron-DeepSpeed # the script cd's here and launches pretrain_gpt.py from it
+
+# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile)
+VOCAB_FILE=$DATA_OUTPUT_PATH/data/gpt2-vocab.json
+MERGE_FILE=$DATA_OUTPUT_PATH/data/gpt2-merges.txt
+DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
+
+# defining the right environment variables
+source $six_ALL_CCFRWORK/start-prod
+export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
+export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
+export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
+export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+cd $MEGATRON_DEEPSPEED_REPO
+
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+
+# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger
+GPUS_PER_NODE=4 # passed to --nproc_per_node; keep in sync with "#SBATCH --gres=gpu:4"
+NNODES=16 # passed to --nnodes; keep in sync with "#SBATCH --nodes=16"
+PP_SIZE=2 # pipeline-parallel degree; NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=1 # tensor-parallel degree (--tensor-model-parallel-size); 1 here, not the 4 GPUs of a node
+DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # = 32 with the values above; not used below, the trainer derives it itself
+
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=512
+TRAIN_ITER=73_242_187
+
+NLAYERS=24
+NHIDDEN=2048
+NHEADS=16
+FFN_HIDDEN_SIZE=8192
+SEQ_LEN=2048
+
+SAVE_INTERVAL=1500
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr 2e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --lr-decay-samples 73_242_187 \
+    --lr-warmup-samples 183_105 \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    --eval-only 1\
+    "
+
+EXIT_OPTS=" \
+    --exit-duration-in-mins 1190 \
+    "
+increment=0
+SEQ_LEN_2=$(($increment + $SEQ_LEN))
+echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"
+
+GPT_ARGS=" \
+--num-layers $NLAYERS \
+--hidden-size $NHIDDEN \
+--num-attention-heads $NHEADS \
+--ffn-hidden-size $FFN_HIDDEN_SIZE \
+--seq-length $SEQ_LEN_2 \
+--max-position-embeddings $SEQ_LEN_2 \
+--micro-batch-size $MICRO_BATCH_SIZE \
+--global-batch-size $GLOBAL_BATCH_SIZE \
+--rampup-batch-size 32 32 2_000_000 \
+--train-samples $TRAIN_ITER \
+--vocab-file $VOCAB_FILE \
+--merge-file $MERGE_FILE \
+--loss-scale 12 \
+--clip-grad 1.0 \
+--fp16 \
+--checkpoint-activations \
+--position-embedding-type alibi \
+$OPTIMIZER_ARGS \
+$EXIT_OPTS \
+"
+
+OUTPUT_ARGS=" \
+--log-interval 200 \
+--save-interval $SAVE_INTERVAL \
+--eval-interval 1000 \
+--eval-iters 100 \
+--tensorboard-dir $TENSORBOARD_PATH \
+--tensorboard-queue-size 5 \
+--log-timers-to-tensorboard \
+--log-batch-size-to-tensorboard \
+--log-validation-ppl-to-tensorboard \
+"
+# TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
+# series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
+
+ZERO_STAGE=1
+
+config_json="./ds_config.$SLURM_JOBID.json"
+
+# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+cat <<EOT > $config_json
+{
+"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+"train_batch_size": $GLOBAL_BATCH_SIZE,
+"gradient_clipping": 1.0,
+"zero_optimization": {
+    "stage": $ZERO_STAGE
+},
+"fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+},
+"steps_per_print": 2000,
+"wall_clock_breakdown": false
+}
+EOT
+
+DEEPSPEED_ARGS=" \
+--deepspeed \
+--deepspeed_config ${config_json} \
+--zero-stage ${ZERO_STAGE} \
+--deepspeed-activation-checkpointing \
+"
+
+export LAUNCHER="python -u -m torch.distributed.launch \
+--nproc_per_node $GPUS_PER_NODE \
+--nnodes $NNODES \
+--master_addr $MASTER_ADDR \
+--master_port $MASTER_PORT \
+"
+
+export CMD=" \
+$(pwd)/pretrain_gpt.py \
+--tensor-model-parallel-size $TP_SIZE \
+--pipeline-model-parallel-size $PP_SIZE \
+$GPT_ARGS \
+$OUTPUT_ARGS \
+--save $CHECKPOINT_PATH \
+--load $CHECKPOINT_PATH \
+--data-path $DATA_PATH \
+--data-impl mmap \
+--split 949,50,1 \
+--distributed-backend nccl \
+$DEEPSPEED_ARGS \
+" # exported so the per-node "bash -c" started by srun below can expand $CMD
+
+# # clear old checkpoint as it'd mismatch while we sort things out
+#     rm -rf $SAVE_CHECKPOINT_PATH
+
+echo $CMD
+
+# to debug - add echo (it exits and prints what it would have launched)
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation-2.$SLURM_JOBID.out
+
+for increment in {2000..3000..100}; do
+    SEQ_LEN_2=$(($increment + $SEQ_LEN))
+    echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"
+
+    GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --ffn-hidden-size $FFN_HIDDEN_SIZE \
+    --seq-length $SEQ_LEN_2 \
+    --max-position-embeddings $SEQ_LEN_2 \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --rampup-batch-size 32 32 2_000_000 \
+    --train-samples $TRAIN_ITER \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --loss-scale 12 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --checkpoint-activations \
+    --position-embedding-type alibi \
+    $OPTIMIZER_ARGS \
+    $EXIT_OPTS \
+    "
+
+    OUTPUT_ARGS=" \
+    --log-interval 200 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+    # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
+    # series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
+
+    ZERO_STAGE=1
+
+    config_json="./ds_config.$SLURM_JOBID.json"
+
+    # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+    cat <<EOT > $config_json
+{
+"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+"train_batch_size": $GLOBAL_BATCH_SIZE,
+"gradient_clipping": 1.0,
+"zero_optimization": {
+    "stage": $ZERO_STAGE
+},
+"fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+},
+"steps_per_print": 2000,
+"wall_clock_breakdown": false
+}
+EOT
+
+    DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+
+    export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+
+    export CMD=" \
+    $(pwd)/pretrain_gpt.py \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $OUTPUT_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --data-path $DATA_PATH \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+    $DEEPSPEED_ARGS \
+    " # exported so the per-node "bash -c" started by srun below can expand $CMD
+
+    # # clear old checkpoint as it'd mismatch while we sort things out
+    #     rm -rf $SAVE_CHECKPOINT_PATH
+
+    echo $CMD
+
+    # to debug - add echo (it exits and prints what it would have launched)
+    srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation-2.$SLURM_JOBID.out
+done

From 6002b7ef0c05b478a93aba3bada04d3c305069cc Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 12:15:59 +0200
Subject: [PATCH 05/14] add extrapolation law checkpoint 117k alibi

---
 ...7d-extrapolation-law-checkpoint-117k.slurm | 185 ++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm

diff --git a/train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm b/train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm
new file mode 100644
index 00000000..bed98ce5
--- /dev/null
+++ b/train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm
@@ -0,0 +1,185 @@
+#!/bin/bash
+#SBATCH --job-name=1B3-alibi-extrapolation-checkpoint-117k
+#SBATCH --qos=qos_gpu-t3
+#SBATCH --nodes=16
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=40         # number of cores per tasks
+#SBATCH --hint=nomultithread         # we get physical cores not logical
+#SBATCH --gres=gpu:4                 # number of gpus
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/alibi/%x-%j.out
+#SBATCH --account=six@gpu
+
+set -x -e -o pipefail # trace commands and abort on errors; pipefail keeps "| tee" from hiding a failed srun
+
+
+# Experiment paths, all rooted at DATA_OUTPUT_PATH; to target another run, replace "tr7d-1B3-alibi" below
+DATA_OUTPUT_PATH=$SCRATCH/checkpoints/tr7d-1B3-alibi
+CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints # passed to both --save and --load
+REPO_PATH=$DATA_OUTPUT_PATH/tr7d-1B3-alibi-logs
+TENSORBOARD_PATH=$REPO_PATH/tensorboard # passed to --tensorboard-dir
+CODECARBON_PATH=$REPO_PATH/codecarbon # only needed if --codecarbon-dir is enabled (see the TODO further down)
+LOGS_PATH=$REPO_PATH/logs # not referenced anywhere else in this script
+VAL_LOGS_PATH=$REPO_PATH/val-logs # srun output is appended here via tee; this script does not create the directory
+MEGATRON_DEEPSPEED_REPO=$DATA_OUTPUT_PATH/code/Megatron-DeepSpeed # the script cd's here and launches pretrain_gpt.py from it
+
+# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile)
+VOCAB_FILE=$DATA_OUTPUT_PATH/data/gpt2-vocab.json
+MERGE_FILE=$DATA_OUTPUT_PATH/data/gpt2-merges.txt
+DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
+
+# defining the right environment variables
+source $six_ALL_CCFRWORK/start-prod
+export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
+export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
+export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
+export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+cd $MEGATRON_DEEPSPEED_REPO
+
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+
+# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger
+GPUS_PER_NODE=4 # passed to --nproc_per_node; keep in sync with "#SBATCH --gres=gpu:4"
+NNODES=16 # passed to --nnodes; keep in sync with "#SBATCH --nodes=16"
+PP_SIZE=2 # pipeline-parallel degree; NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=1 # tensor-parallel degree (--tensor-model-parallel-size); 1 here, not the 4 GPUs of a node
+DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # = 32 with the values above; not used below, the trainer derives it itself
+
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=512
+TRAIN_ITER=73_242_187
+
+NLAYERS=24
+NHIDDEN=2048
+NHEADS=16
+FFN_HIDDEN_SIZE=8192
+SEQ_LEN=2048
+
+SAVE_INTERVAL=1500
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr 2e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --lr-decay-samples 73_242_187 \
+    --lr-warmup-samples 183_105 \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    --eval-only 1\
+    "
+
+EXIT_OPTS=" \
+    --exit-duration-in-mins 1190 \
+    "
+
+for increment in {0..4000..100}; do
+    SEQ_LEN_2=$(($increment + $SEQ_LEN))
+    echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"
+
+    GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --ffn-hidden-size $FFN_HIDDEN_SIZE \
+    --seq-length $SEQ_LEN_2 \
+    --max-position-embeddings $SEQ_LEN_2 \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --rampup-batch-size 32 32 2_000_000 \
+    --train-samples $TRAIN_ITER \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --loss-scale 12 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --checkpoint-activations \
+    --position-embedding-type alibi \
+    $OPTIMIZER_ARGS \
+    $EXIT_OPTS \
+    "
+
+    OUTPUT_ARGS=" \
+    --log-interval 200 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+    # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
+    # series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
+
+    ZERO_STAGE=1
+
+    config_json="./ds_config.$SLURM_JOBID.json"
+
+    # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+    cat <<EOT > $config_json
+{
+"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+"train_batch_size": $GLOBAL_BATCH_SIZE,
+"gradient_clipping": 1.0,
+"zero_optimization": {
+    "stage": $ZERO_STAGE
+},
+"fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+},
+"steps_per_print": 2000,
+"wall_clock_breakdown": false
+}
+EOT
+
+    DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+
+    export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+
+    export CMD=" \
+    $(pwd)/pretrain_gpt.py \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $OUTPUT_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --data-path $DATA_PATH \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+    $DEEPSPEED_ARGS \
+    " # exported so the per-node "bash -c" started by srun below can expand $CMD
+
+    # # clear old checkpoint as it'd mismatch while we sort things out
+    #     rm -rf $SAVE_CHECKPOINT_PATH
+
+    echo $CMD
+
+    # to debug - add echo (it exits and prints what it would have launched)
+    srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation.$SLURM_JOBID.out
+done

From c14c1f555a067f776cf958750461f8728ef9c3dd Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 12:27:59 +0200
Subject: [PATCH 06/14] copy training script

---
 ...B3-extrapolation-law-checkpoint-177k.slurm | 182 ++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100644 train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
new file mode 100644
index 00000000..4fbe7831
--- /dev/null
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -0,0 +1,182 @@
+#!/bin/bash
+#SBATCH --job-name=1B3-rotary-oscar.slurm
+#SBATCH --qos=qos_gpu-t3
+#SBATCH --nodes=16
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=40         # number of cores per tasks
+#SBATCH --hint=nomultithread         # we get physical cores not logical
+#SBATCH --gres=gpu:4                 # number of gpus
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out          # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err           # error file name
+#SBATCH --account=six@gpu
+
+set -x -e
+
+# TODO: modify these for your training setup, just Ctrl-F replace <YOUR_TRAINING_NAME>
+DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/synched_exps/tr4c-1B3-rotary-oscar
+CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
+REPO_PATH=$DATA_OUTPUT_PATH/tr4c-1B3-rotary-oscar-logs
+TENSORBOARD_PATH=$REPO_PATH/tensorboard
+CODECARBON_PATH=$REPO_PATH/codecarbon
+LOGS_PATH=$REPO_PATH/logs
+MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed
+
+# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile)
+VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
+MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
+DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
+
+# defining the right environment variables
+source $six_ALL_CCFRWORK/start-prod
+export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
+export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
+export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
+export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+cd $MEGATRON_DEEPSPEED_REPO
+
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+
+# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger
+GPUS_PER_NODE=4 # handed to the launcher as --nproc_per_node; same count as '#SBATCH --gres=gpu:4'
+NNODES=16 # handed to the launcher as --nnodes; same count as '#SBATCH --nodes=16'
+PP_SIZE=4                                                 # NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=4                                                 # always fixed to the size of a single node
+DP_SIZE=$((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE))) # will get derived automatically by trainer; not referenced again in this script
+
+MICRO_BATCH_SIZE=8 # used for --micro-batch-size and for train_micro_batch_size_per_gpu in the DeepSpeed config
+GLOBAL_BATCH_SIZE=512 # used for --global-batch-size and for train_batch_size in the DeepSpeed config
+TRAIN_ITER=73_242_187 # despite the name, this value is passed as --train-samples
+
+NLAYERS=24 # --num-layers
+NHIDDEN=2048 # --hidden-size
+NHEADS=16 # --num-attention-heads
+FFN_HIDDEN_SIZE=8192 # --ffn-hidden-size
+SEQ_LEN=2048
+
+SAVE_INTERVAL=1500 # --save-interval
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr 2e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --lr-decay-samples 73_242_187 \
+    --lr-warmup-samples 183_105 \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    "
+
+EXIT_OPTS=" \
+    --exit-duration-in-mins 1190 \
+    "
+
+GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --ffn-hidden-size $FFN_HIDDEN_SIZE \
+    --seq-length $SEQ_LEN \
+    --position-embedding-type rotary \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --rampup-batch-size 32 32 2_000_000 \
+    --train-samples $TRAIN_ITER \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --loss-scale 12 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --checkpoint-activations \
+    $OPTIMIZER_ARGS \
+    $EXIT_OPTS \
+    "
+
+OUTPUT_ARGS=" \
+    --log-interval 200 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+# TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
+# series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
+
+ZERO_STAGE=1
+
+config_json="./ds_config.$SLURM_JOBID.json"
+
+# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+cat <<EOT >$config_json
+{
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+  "train_batch_size": $GLOBAL_BATCH_SIZE,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
+EOT
+
+DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+
+export CMD=" \
+    $(pwd)/pretrain_gpt.py \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $OUTPUT_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --data-path $DATA_PATH \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+     $DEEPSPEED_ARGS \
+    "
+
+# # clear old checkpoint as it'd mismatch while we sort things out
+#     rm -rf $SAVE_CHECKPOINT_PATH
+
+echo $CMD
+
+# We create the folder where the logs and codecarbon will be stored.
+mkdir -p $LOGS_PATH
+# Uncomment if you use codecarbon
+# mkdir -p $CODECARBON_PATH
+
+# to debug - add echo (it exits and prints what it would have launched)
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt

From 8dbc37874e69708e0ed2104f6590c654a3b419a5 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 12:32:39 +0200
Subject: [PATCH 07/14] adapt paths and slurm config

---
 ...r4c-1B3-extrapolation-law-checkpoint-177k.slurm | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index 4fbe7831..c3aedba7 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=1B3-rotary-oscar.slurm
+#SBATCH --job-name=1B3-rotary-extrapolation-checkpoint-177k
 #SBATCH --qos=qos_gpu-t3
 #SBATCH --nodes=16
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
@@ -7,24 +7,24 @@
 #SBATCH --hint=nomultithread         # we get physical cores not logical
 #SBATCH --gres=gpu:4                 # number of gpus
 #SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
-#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out          # output file name
-#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err           # error file name
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/rotary/%x-%j.out
 #SBATCH --account=six@gpu
 
 set -x -e
 
 # TODO: modify these for your training setup, just Ctrl-F replace <YOUR_TRAINING_NAME>
-DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/synched_exps/tr4c-1B3-rotary-oscar
+DATA_OUTPUT_PATH=$SCRATCH/synched_exps/tr4c-1B3-rotary-oscar
 CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints
 REPO_PATH=$DATA_OUTPUT_PATH/tr4c-1B3-rotary-oscar-logs
 TENSORBOARD_PATH=$REPO_PATH/tensorboard
 CODECARBON_PATH=$REPO_PATH/codecarbon
 LOGS_PATH=$REPO_PATH/logs
-MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed
+VAL_LOGS_PATH=$REPO_PATH/val-logs
+MEGATRON_DEEPSPEED_REPO=$SCRATCH/checkpoints/tr7d-1B3-alibi/code/Megatron-DeepSpeed # use code fixed alibi
 
 # TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile)
-VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
-MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
+VOCAB_FILE=$SCRATCH/checkpoints/tr7d-1B3-alibi/data/gpt2-vocab.json
+MERGE_FILE=$SCRATCH/checkpoints/tr7d-1B3-alibi/data/gpt2-merges.txt
 DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document
 
 # defining the right environment variables

From cd41b7a9eff976c75099f7d43c353b40ed57df31 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 12:34:51 +0200
Subject: [PATCH 08/14] change eval

---
 .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index c3aedba7..829d4bcf 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -72,19 +72,25 @@ OPTIMIZER_ARGS=" \
     --lr-warmup-samples 183_105 \
     --clip-grad 1.0 \
     --weight-decay 1e-1 \
+    --eval-only 1 \
     "
 
 EXIT_OPTS=" \
     --exit-duration-in-mins 1190 \
     "
 
-GPT_ARGS=" \
+for increment in {0..4000..100}; do
+    SEQ_LEN_2=$(($increment + $SEQ_LEN))
+    echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"
+
+    GPT_ARGS=" \
     --num-layers $NLAYERS \
     --hidden-size $NHIDDEN \
     --num-attention-heads $NHEADS \
     --ffn-hidden-size $FFN_HIDDEN_SIZE \
-    --seq-length $SEQ_LEN \
+    --seq-length $SEQ_LEN_2 \
     --position-embedding-type rotary \
+    --max-position-embeddings $SEQ_LEN_2 \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --global-batch-size $GLOBAL_BATCH_SIZE \
     --rampup-batch-size 32 32 2_000_000 \
@@ -99,7 +105,7 @@ GPT_ARGS=" \
     $EXIT_OPTS \
     "
 
-OUTPUT_ARGS=" \
+    OUTPUT_ARGS=" \
     --log-interval 200 \
     --save-interval $SAVE_INTERVAL \
     --eval-interval 1000 \

From 6b8a26fda9135ffcffc72b7d8a358537b6aa2166 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 12:35:29 +0200
Subject: [PATCH 09/14] indentation

---
 ...B3-extrapolation-law-checkpoint-177k.slurm | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index 829d4bcf..a33874d8 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -116,51 +116,51 @@ for increment in {0..4000..100}; do
     --log-batch-size-to-tensorboard \
     --log-validation-ppl-to-tensorboard \
     "
-# TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
-# series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
+    # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current
+    # series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done
 
-ZERO_STAGE=1
+    ZERO_STAGE=1
 
-config_json="./ds_config.$SLURM_JOBID.json"
+    config_json="./ds_config.$SLURM_JOBID.json"
 
-# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
-cat <<EOT >$config_json
+    # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
+    cat <<EOT > $config_json
 {
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
-  "train_batch_size": $GLOBAL_BATCH_SIZE,
-  "gradient_clipping": 1.0,
-  "zero_optimization": {
+"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+"train_batch_size": $GLOBAL_BATCH_SIZE,
+"gradient_clipping": 1.0,
+"zero_optimization": {
     "stage": $ZERO_STAGE
-  },
-  "fp16": {
+},
+"fp16": {
     "enabled": true,
     "loss_scale": 0,
     "loss_scale_window": 500,
     "hysteresis": 2,
     "min_loss_scale": 1,
     "initial_scale_power": 12
-  },
-  "steps_per_print": 2000,
-  "wall_clock_breakdown": false
+},
+"steps_per_print": 2000,
+"wall_clock_breakdown": false
 }
 EOT
 
-DEEPSPEED_ARGS=" \
+    DEEPSPEED_ARGS=" \
     --deepspeed \
     --deepspeed_config ${config_json} \
     --zero-stage ${ZERO_STAGE} \
     --deepspeed-activation-checkpointing \
     "
 
-export LAUNCHER="python -u -m torch.distributed.launch \
+    export LAUNCHER="python -u -m torch.distributed.launch \
     --nproc_per_node $GPUS_PER_NODE \
     --nnodes $NNODES \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
     "
 
-export CMD=" \
-    $(pwd)/pretrain_gpt.py \
+    export CMD=" \
+    `pwd`/pretrain_gpt.py \
     --tensor-model-parallel-size $TP_SIZE \
     --pipeline-model-parallel-size $PP_SIZE \
     $GPT_ARGS \

From 4b40871f2d58a39133257f85ee773d7be969ecb2 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 12:36:50 +0200
Subject: [PATCH 10/14] last adaptation

---
 ...1B3-extrapolation-law-checkpoint-177k.slurm | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index a33874d8..655b4ff1 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -171,18 +171,14 @@ EOT
     --data-impl mmap \
     --split 949,50,1 \
     --distributed-backend nccl \
-     $DEEPSPEED_ARGS \
+    $DEEPSPEED_ARGS \
     "
 
-# # clear old checkpoint as it'd mismatch while we sort things out
-#     rm -rf $SAVE_CHECKPOINT_PATH
+    # # clear old checkpoint as it'd mismatch while we sort things out
+    #     rm -rf $SAVE_CHECKPOINT_PATH
 
-echo $CMD
+    echo $CMD
 
-# We create the folder where the logs and codecarbon will be stored.
-mkdir -p $LOGS_PATH
-# Uncomment if you use codecarbon
-# mkdir -p $CODECARBON_PATH
-
-# to debug - add echo (it exits and prints what it would have launched)
-srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt
+    # to debug - add echo (it exits and prints what it would have launched)
+    srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr4c-1B3-rotary-extrapolation.$SLURM_JOBID.out
+    done

From 734afb9a4e75087bff0cfab18a1830d99f2c5472 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 14:20:15 +0200
Subject: [PATCH 11/14] rotary needs the max-position-embeddings arg set to None

---
 .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm             | 1 -
 1 file changed, 1 deletion(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index 655b4ff1..345d4de9 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -90,7 +90,6 @@ for increment in {0..4000..100}; do
     --ffn-hidden-size $FFN_HIDDEN_SIZE \
     --seq-length $SEQ_LEN_2 \
     --position-embedding-type rotary \
-    --max-position-embeddings $SEQ_LEN_2 \
     --micro-batch-size $MICRO_BATCH_SIZE \
     --global-batch-size $GLOBAL_BATCH_SIZE \
     --rampup-batch-size 32 32 2_000_000 \

From e938721124a1a74f96ff9ab4e1f37dcbd9ff260d Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 15:17:04 +0200
Subject: [PATCH 12/14] typo

---
 .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index 345d4de9..d9d0f3bf 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -180,4 +180,4 @@ EOT
 
     # to debug - add echo (it exits and prints what it would have launched)
     srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr4c-1B3-rotary-extrapolation.$SLURM_JOBID.out
-    done
+done

From a2acc4b133949e462480d1e377ee267820d4d002 Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Wed, 13 Apr 2022 17:59:36 +0200
Subject: [PATCH 13/14] try

---
 .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index d9d0f3bf..24b7637a 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -79,7 +79,7 @@ EXIT_OPTS=" \
     --exit-duration-in-mins 1190 \
     "
 
-for increment in {0..4000..100}; do
+for increment in {100..4000..100}; do
     SEQ_LEN_2=$(($increment + $SEQ_LEN))
     echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"
 

From de62cfe2ad455328eb3e96c9fa2604f5c0356f6b Mon Sep 17 00:00:00 2001
From: SaulLu <lucilesaul.com@gmail.com>
Date: Thu, 14 Apr 2022 09:54:37 +0200
Subject: [PATCH 14/14] fix

---
 .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm          | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
index 24b7637a..9fe0b55e 100644
--- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
+++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm
@@ -37,6 +37,8 @@ export HF_DATASETS_OFFLINE=1
 export TRANSFORMERS_OFFLINE=1
 cd $MEGATRON_DEEPSPEED_REPO
 
+mkdir -p $VAL_LOGS_PATH
+
 # so processes know who to talk to
 MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 MASTER_PORT=6000
@@ -79,7 +81,7 @@ EXIT_OPTS=" \
     --exit-duration-in-mins 1190 \
     "
 
-for increment in {100..4000..100}; do
+for increment in {0..4000..100}; do
     SEQ_LEN_2=$(($increment + $SEQ_LEN))
     echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"