From 601185a8709c11e9d1c088d1ff3c4b0912cc4724 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 28 Mar 2022 17:47:43 +0200 Subject: [PATCH 01/14] add template extrapolation --- train/tr7-alibi/tr7d-extrapolation-law.slurm | 183 +++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 train/tr7-alibi/tr7d-extrapolation-law.slurm diff --git a/train/tr7-alibi/tr7d-extrapolation-law.slurm b/train/tr7-alibi/tr7d-extrapolation-law.slurm new file mode 100644 index 00000000..69f402b5 --- /dev/null +++ b/train/tr7-alibi/tr7d-extrapolation-law.slurm @@ -0,0 +1,183 @@ +#!/bin/bash +#SBATCH --job-name=350M-alibi-extrapolation +#SBATCH --qos=qos_gpu-t3 +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/alibi/%x-%j.out +#SBATCH --account=six@gpu + +set -x -e + +ROUND=2 +TESTING=0 + +OUTPUT_PATH=$SCRATCH/synched_exps/tr7b-350M-alibi +MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed + +VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json +MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt +DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document + +source $six_ALL_CCFRWORK/start-prod +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +cd $MEGATRON_DEEPSPEED_REPO + +MASTER_ADDR=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print') +MASTER_PORT=6000 + +# adjust depending on the number of the nodes + +# XXX: edit me +GPUS_PER_NODE=4 +NNODES=4 +PP_SIZE=2 # NLAYERS must be a 
multiple of PP_SIZE here +TP_SIZE=4 # always fixed to the size of a single node +DP_SIZE=$((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=8 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=73_242_187 + +NLAYERS=24 +NHIDDEN=1024 +NHEADS=16 +FFN_HIDDEN_SIZE=4096 +SEQ_LEN=2048 + +if [[ ${ROUND} == 1 ]]; then + EXIT_INTERVAL=100 SAVE_INTERVAL=10 +elif [[ ${ROUND} == 2 ]]; then + SAVE_INTERVAL=1500 +else + echo "invalid ROUND: $ROUND" +fi + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 3e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 73_242_187 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --no-train 1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +for increment in {100..2000..100}; do + SEQ_LEN_2=$(($increment + $SEQ_LEN)) + echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****" + + GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN_2 \ + --max-position-embeddings $SEQ_LEN_2 \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ + --train-samples $TRAIN_ITER \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + --position-embedding-type alibi \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + + OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $OUTPUT_PATH/validation/tensorboard \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + + ZERO_STAGE=1 + + config_json="./ds_config.$SLURM_JOBID.json" + + # Deepspeed figures out GAS 
dynamically from dynamic GBS via set_train_batch_size() + cat <<EOT >$config_json +{ +"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, +"train_batch_size": $GLOBAL_BATCH_SIZE, +"gradient_clipping": 1.0, +"zero_optimization": { + "stage": $ZERO_STAGE +}, +"fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 +}, +"steps_per_print": 2000, +"wall_clock_breakdown": false +} +EOT + + DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + + export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + " + + export CMD=" \ + $(pwd)/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $OUTPUT_PATH/checkpoints \ + --load $OUTPUT_PATH/checkpoints \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + # # clear old checkpoint as it'd mismatch while we sort things out + # rm -rf $SAVE_CHECKPOINT_PATH + + echo $CMD + + # to debug - add echo (it exits and prints what it would have launched) + srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $OUTPUT_PATH/validation/logs/tr7b-350M-alibi-extrapolation.$SLURM_JOBID.out +done From 59f98677652039928a37355bb31fe00bd5f3e57a Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 28 Mar 2022 18:08:06 +0200 Subject: [PATCH 02/14] save changes --- train/tr7-alibi/tr7d-extrapolation-law.slurm | 68 ++++++++++---------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/train/tr7-alibi/tr7d-extrapolation-law.slurm b/train/tr7-alibi/tr7d-extrapolation-law.slurm index 69f402b5..b9475585 100644 --- 
a/train/tr7-alibi/tr7d-extrapolation-law.slurm +++ b/train/tr7-alibi/tr7d-extrapolation-law.slurm @@ -12,16 +12,23 @@ set -x -e -ROUND=2 -TESTING=0 -OUTPUT_PATH=$SCRATCH/synched_exps/tr7b-350M-alibi -MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed - -VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json -MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt +# TODO: modify these for your training setup, just Ctrl-F replace +DATA_OUTPUT_PATH=$SCRATCH/checkpoints/tr7d-1B3-alibi +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints +REPO_PATH=$DATA_OUTPUT_PATH/tr7d-1B3-alibi-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs +VAL_LOGS_PATH=$REPO_PATH/val-logs +MEGATRON_DEEPSPEED_REPO=$DATA_OUTPUT_PATH/code/Megatron-DeepSpeed + +# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile) +VOCAB_FILE=$DATA_OUTPUT_PATH/data/gpt2-vocab.json +MERGE_FILE=$DATA_OUTPUT_PATH/data/gpt2-merges.txt DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document +# defining the right environment variables source $six_ALL_CCFRWORK/start-prod export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets @@ -31,49 +38,42 @@ export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 cd $MEGATRON_DEEPSPEED_REPO -MASTER_ADDR=$(perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print') +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) MASTER_PORT=6000 -# adjust depending on the number of the nodes - -# XXX: edit me +# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger GPUS_PER_NODE=4 -NNODES=4 -PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here -TP_SIZE=4 # always fixed to the size of a single node -DP_SIZE=$((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE))) # will get derived automatically by 
trainer +NNODES=16 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer -MICRO_BATCH_SIZE=8 +MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=512 TRAIN_ITER=73_242_187 NLAYERS=24 -NHIDDEN=1024 +NHIDDEN=2048 NHEADS=16 -FFN_HIDDEN_SIZE=4096 +FFN_HIDDEN_SIZE=8192 SEQ_LEN=2048 -if [[ ${ROUND} == 1 ]]; then - EXIT_INTERVAL=100 SAVE_INTERVAL=10 -elif [[ ${ROUND} == 2 ]]; then - SAVE_INTERVAL=1500 -else - echo "invalid ROUND: $ROUND" -fi +SAVE_INTERVAL=1500 OPTIMIZER_ARGS=" \ --optimizer adam \ --adam-beta1 0.9 \ --adam-beta2 0.999 \ --adam-eps 1e-8 \ - --lr 3e-4 \ + --lr 2e-4 \ --min-lr 1e-5 \ --lr-decay-style cosine \ --lr-decay-samples 73_242_187 \ --lr-warmup-samples 183_105 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ - --no-train 1 \ + --eval-only \ " EXIT_OPTS=" \ @@ -111,19 +111,21 @@ for increment in {100..2000..100}; do --save-interval $SAVE_INTERVAL \ --eval-interval 1000 \ --eval-iters 100 \ - --tensorboard-dir $OUTPUT_PATH/validation/tensorboard \ + --tensorboard-dir $TENSORBOARD_PATH \ --tensorboard-queue-size 5 \ --log-timers-to-tensorboard \ --log-batch-size-to-tensorboard \ --log-validation-ppl-to-tensorboard \ " + # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current + # series of experiments consistent, especially speed-wise. 
Adding it once Tr6 and Tr7 are done ZERO_STAGE=1 config_json="./ds_config.$SLURM_JOBID.json" # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() - cat <$config_json + cat < $config_json { "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, "train_batch_size": $GLOBAL_BATCH_SIZE, @@ -159,13 +161,13 @@ EOT " export CMD=" \ - $(pwd)/pretrain_gpt.py \ + `pwd`/pretrain_gpt.py \ --tensor-model-parallel-size $TP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ $OUTPUT_ARGS \ - --save $OUTPUT_PATH/checkpoints \ - --load $OUTPUT_PATH/checkpoints \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ --data-path $DATA_PATH \ --data-impl mmap \ --split 949,50,1 \ @@ -179,5 +181,5 @@ EOT echo $CMD # to debug - add echo (it exits and prints what it would have launched) - srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $OUTPUT_PATH/validation/logs/tr7b-350M-alibi-extrapolation.$SLURM_JOBID.out + srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation.$SLURM_JOBID.out done From 260a601ab69ea7793ccb98bb5b9fd0c47a5fcc4d Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 28 Mar 2022 18:18:43 +0200 Subject: [PATCH 03/14] woups --- train/tr7-alibi/tr7d-extrapolation-law.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train/tr7-alibi/tr7d-extrapolation-law.slurm b/train/tr7-alibi/tr7d-extrapolation-law.slurm index b9475585..3ba5b837 100644 --- a/train/tr7-alibi/tr7d-extrapolation-law.slurm +++ b/train/tr7-alibi/tr7d-extrapolation-law.slurm @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=350M-alibi-extrapolation +#SBATCH --job-name=1B3-alibi-extrapolation #SBATCH --qos=qos_gpu-t3 #SBATCH --nodes=16 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
@@ -73,7 +73,7 @@ OPTIMIZER_ARGS=" \ --lr-warmup-samples 183_105 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ - --eval-only \ + --eval-only 1\ " EXIT_OPTS=" \ From d080ed1775ad0557c5b342ed31b351fde1d1b30e Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 29 Mar 2022 09:54:48 +0200 Subject: [PATCH 04/14] add tests --- .../tr7-alibi/tr7d-extrapolation-law-2.slurm | 287 ++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 train/tr7-alibi/tr7d-extrapolation-law-2.slurm diff --git a/train/tr7-alibi/tr7d-extrapolation-law-2.slurm b/train/tr7-alibi/tr7d-extrapolation-law-2.slurm new file mode 100644 index 00000000..0a3da78a --- /dev/null +++ b/train/tr7-alibi/tr7d-extrapolation-law-2.slurm @@ -0,0 +1,287 @@ +#!/bin/bash +#SBATCH --job-name=1B3-alibi-extrapolation-2 +#SBATCH --qos=qos_gpu-t3 +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/alibi/%x-%j.out +#SBATCH --account=six@gpu + +set -x -e + + +# TODO: modify these for your training setup, just Ctrl-F replace +DATA_OUTPUT_PATH=$SCRATCH/checkpoints/tr7d-1B3-alibi +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints +REPO_PATH=$DATA_OUTPUT_PATH/tr7d-1B3-alibi-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs +VAL_LOGS_PATH=$REPO_PATH/val-logs +MEGATRON_DEEPSPEED_REPO=$DATA_OUTPUT_PATH/code/Megatron-DeepSpeed + +# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile) +VOCAB_FILE=$DATA_OUTPUT_PATH/data/gpt2-vocab.json +MERGE_FILE=$DATA_OUTPUT_PATH/data/gpt2-merges.txt +DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document + +# defining the right 
environment variables +source $six_ALL_CCFRWORK/start-prod +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +cd $MEGATRON_DEEPSPEED_REPO + +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 + +# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger +GPUS_PER_NODE=4 +NNODES=16 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=73_242_187 + +NLAYERS=24 +NHIDDEN=2048 +NHEADS=16 +FFN_HIDDEN_SIZE=8192 +SEQ_LEN=2048 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 73_242_187 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --eval-only 1\ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " +increment=0 +SEQ_LEN_2=$(($increment + $SEQ_LEN)) +echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****" + +GPT_ARGS=" \ +--num-layers $NLAYERS \ +--hidden-size $NHIDDEN \ +--num-attention-heads $NHEADS \ +--ffn-hidden-size $FFN_HIDDEN_SIZE \ +--seq-length $SEQ_LEN_2 \ +--max-position-embeddings $SEQ_LEN_2 \ +--micro-batch-size $MICRO_BATCH_SIZE \ +--global-batch-size $GLOBAL_BATCH_SIZE \ +--rampup-batch-size 32 32 2_000_000 \ +--train-samples $TRAIN_ITER \ +--vocab-file $VOCAB_FILE \ +--merge-file $MERGE_FILE \ +--loss-scale 12 \ +--clip-grad 1.0 \ +--fp16 \ +--checkpoint-activations \ +--position-embedding-type alibi \ +$OPTIMIZER_ARGS \ 
+$EXIT_OPTS \ +" + +OUTPUT_ARGS=" \ +--log-interval 200 \ +--save-interval $SAVE_INTERVAL \ +--eval-interval 1000 \ +--eval-iters 100 \ +--tensorboard-dir $TENSORBOARD_PATH \ +--tensorboard-queue-size 5 \ +--log-timers-to-tensorboard \ +--log-batch-size-to-tensorboard \ +--log-validation-ppl-to-tensorboard \ +" +# TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current +# series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done + +ZERO_STAGE=1 + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT > $config_json +{ +"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, +"train_batch_size": $GLOBAL_BATCH_SIZE, +"gradient_clipping": 1.0, +"zero_optimization": { + "stage": $ZERO_STAGE +}, +"fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 +}, +"steps_per_print": 2000, +"wall_clock_breakdown": false +} +EOT + +DEEPSPEED_ARGS=" \ +--deepspeed \ +--deepspeed_config ${config_json} \ +--zero-stage ${ZERO_STAGE} \ +--deepspeed-activation-checkpointing \ +" + +export LAUNCHER="python -u -m torch.distributed.launch \ +--nproc_per_node $GPUS_PER_NODE \ +--nnodes $NNODES \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +" + +export CMD=" \ +`pwd`/pretrain_gpt.py \ +--tensor-model-parallel-size $TP_SIZE \ +--pipeline-model-parallel-size $PP_SIZE \ +$GPT_ARGS \ +$OUTPUT_ARGS \ +--save $CHECKPOINT_PATH \ +--load $CHECKPOINT_PATH \ +--data-path $DATA_PATH \ +--data-impl mmap \ +--split 949,50,1 \ +--distributed-backend nccl \ +$DEEPSPEED_ARGS \ +" + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + +echo $CMD + +# to debug - add echo (it exits and prints what it would have launched) +srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank 
$SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation-2.$SLURM_JOBID.out + +for increment in {2000..3000..100}; do + SEQ_LEN_2=$(($increment + $SEQ_LEN)) + echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****" + + GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN_2 \ + --max-position-embeddings $SEQ_LEN_2 \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ + --train-samples $TRAIN_ITER \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + --position-embedding-type alibi \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + + OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current + # series of experiments consistent, especially speed-wise. 
Adding it once Tr6 and Tr7 are done + + ZERO_STAGE=1 + + config_json="./ds_config.$SLURM_JOBID.json" + + # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() + cat <<EOT > $config_json +{ +"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, +"train_batch_size": $GLOBAL_BATCH_SIZE, +"gradient_clipping": 1.0, +"zero_optimization": { + "stage": $ZERO_STAGE +}, +"fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 +}, +"steps_per_print": 2000, +"wall_clock_breakdown": false +} +EOT + + DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + + export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + " + + export CMD=" \ + `pwd`/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + # # clear old checkpoint as it'd mismatch while we sort things out + # rm -rf $SAVE_CHECKPOINT_PATH + + echo $CMD + + # to debug - add echo (it exits and prints what it would have launched) + srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation-2.$SLURM_JOBID.out +done From 6002b7ef0c05b478a93aba3bada04d3c305069cc Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 12:15:59 +0200 Subject: [PATCH 05/14] add extrapolation law checkpoint 117k alibi --- ...7d-extrapolation-law-checkpoint-117k.slurm | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 
train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm diff --git a/train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm b/train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm new file mode 100644 index 00000000..bed98ce5 --- /dev/null +++ b/train/tr7-alibi/tr7d-extrapolation-law-checkpoint-117k.slurm @@ -0,0 +1,185 @@ +#!/bin/bash +#SBATCH --job-name=1B3-alibi-extrapolation-checkpoint-117k +#SBATCH --qos=qos_gpu-t3 +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/alibi/%x-%j.out +#SBATCH --account=six@gpu + +set -x -e + + +# TODO: modify these for your training setup, just Ctrl-F replace +DATA_OUTPUT_PATH=$SCRATCH/checkpoints/tr7d-1B3-alibi +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints +REPO_PATH=$DATA_OUTPUT_PATH/tr7d-1B3-alibi-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs +VAL_LOGS_PATH=$REPO_PATH/val-logs +MEGATRON_DEEPSPEED_REPO=$DATA_OUTPUT_PATH/code/Megatron-DeepSpeed + +# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile) +VOCAB_FILE=$DATA_OUTPUT_PATH/data/gpt2-vocab.json +MERGE_FILE=$DATA_OUTPUT_PATH/data/gpt2-merges.txt +DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document + +# defining the right environment variables +source $six_ALL_CCFRWORK/start-prod +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +cd $MEGATRON_DEEPSPEED_REPO + +# so processes 
know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 + +# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger +GPUS_PER_NODE=4 +NNODES=16 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=73_242_187 + +NLAYERS=24 +NHIDDEN=2048 +NHEADS=16 +FFN_HIDDEN_SIZE=8192 +SEQ_LEN=2048 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 73_242_187 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --eval-only 1\ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +for increment in {0..4000..100}; do + SEQ_LEN_2=$(($increment + $SEQ_LEN)) + echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****" + + GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN_2 \ + --max-position-embeddings $SEQ_LEN_2 \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ + --train-samples $TRAIN_ITER \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + --position-embedding-type alibi \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + + OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + # 
TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current + # series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done + + ZERO_STAGE=1 + + config_json="./ds_config.$SLURM_JOBID.json" + + # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() + cat <<EOT > $config_json +{ +"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, +"train_batch_size": $GLOBAL_BATCH_SIZE, +"gradient_clipping": 1.0, +"zero_optimization": { + "stage": $ZERO_STAGE +}, +"fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 +}, +"steps_per_print": 2000, +"wall_clock_breakdown": false +} +EOT + + DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + + export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + " + + export CMD=" \ + `pwd`/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + # # clear old checkpoint as it'd mismatch while we sort things out + # rm -rf $SAVE_CHECKPOINT_PATH + + echo $CMD + + # to debug - add echo (it exits and prints what it would have launched) + srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr7d-1B3-alibi-extrapolation.$SLURM_JOBID.out +done From c14c1f555a067f776cf958750461f8728ef9c3dd Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 12:27:59 +0200 Subject: [PATCH 06/14] copy training script --- 
...B3-extrapolation-law-checkpoint-177k.slurm | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm new file mode 100644 index 00000000..4fbe7831 --- /dev/null +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -0,0 +1,182 @@ +#!/bin/bash +#SBATCH --job-name=1B3-rotary-oscar.slurm +#SBATCH --qos=qos_gpu-t3 +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu + +set -x -e + +# TODO: modify these for your training setup, just Ctrl-F replace +DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/synched_exps/tr4c-1B3-rotary-oscar +CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints +REPO_PATH=$DATA_OUTPUT_PATH/tr4c-1B3-rotary-oscar-logs +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs +MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed + +# TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile) +VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json +MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt +DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document + +# defining the right environment variables +source $six_ALL_CCFRWORK/start-prod +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export 
HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +cd $MEGATRON_DEEPSPEED_REPO + +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 + +# TODO: this is our base config for 1B3, edit PP/TP/batch size/model config if smaller or bigger +GPUS_PER_NODE=4 +NNODES=16 +PP_SIZE=4 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=4 # always fixed to the size of a single node +DP_SIZE=$((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=8 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=73_242_187 + +NLAYERS=24 +NHIDDEN=2048 +NHEADS=16 +FFN_HIDDEN_SIZE=8192 +SEQ_LEN=2048 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 73_242_187 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --position-embedding-type rotary \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ + --train-samples $TRAIN_ITER \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + 
--log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " +# TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current +# series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done + +ZERO_STAGE=1 + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat <<EOT >$config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + " + +export CMD=" \ + $(pwd)/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. 
+mkdir -p $LOGS_PATH +# Uncomment if you use codecarbon +# mkdir -p $CODECARBON_PATH + +# to debug - add echo (it exits and prints what it would have launched) +srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt From 8dbc37874e69708e0ed2104f6590c654a3b419a5 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 12:32:39 +0200 Subject: [PATCH 07/14] adapt paths and slurm config --- ...r4c-1B3-extrapolation-law-checkpoint-177k.slurm | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index 4fbe7831..c3aedba7 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=1B3-rotary-oscar.slurm +#SBATCH --job-name=1B3-rotary-extrapolation-checkpoint-117k #SBATCH --qos=qos_gpu-t3 #SBATCH --nodes=16 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
@@ -7,24 +7,24 @@ #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:4 # number of gpus #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/rotary/%x-%j.out #SBATCH --account=six@gpu set -x -e # TODO: modify these for your training setup, just Ctrl-F replace -DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/synched_exps/tr4c-1B3-rotary-oscar +DATA_OUTPUT_PATH=$SCRATCH/synched_exps/tr4c-1B3-rotary-oscar CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints REPO_PATH=$DATA_OUTPUT_PATH/tr4c-1B3-rotary-oscar-logs TENSORBOARD_PATH=$REPO_PATH/tensorboard CODECARBON_PATH=$REPO_PATH/codecarbon LOGS_PATH=$REPO_PATH/logs -MEGATRON_DEEPSPEED_REPO=$SCRATCH/repos/Megatron-DeepSpeed +VAL_LOGS_PATH=$REPO_PATH/val-logs +MEGATRON_DEEPSPEED_REPO=$SCRATCH/checkpoints/tr7d-1B3-alibi/code/Megatron-DeepSpeed # use code fixed alibi # TODO: you may change the dataset, some examples are at tr3-1B3-baseline (tr3 = c4 + t5-tokenizer, tr3m = the Pile) -VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json -MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt +VOCAB_FILE=$SCRATCH/checkpoints/tr7d-1B3-alibi/data/gpt2-vocab.json +MERGE_FILE=$SCRATCH/checkpoints/tr7d-1B3-alibi/data/gpt2-merges.txt DATA_PATH=$six_ALL_CCFRWORK/datasets-custom/oscar-en/meg-gpt2_text_document # defining the right environment variables From cd41b7a9eff976c75099f7d43c353b40ed57df31 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 12:34:51 +0200 Subject: [PATCH 08/14] change eval --- .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm 
b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index c3aedba7..829d4bcf 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -72,19 +72,25 @@ OPTIMIZER_ARGS=" \ --lr-warmup-samples 183_105 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ + --eval-only 1\ " EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " -GPT_ARGS=" \ +for increment in {0..4000..100}; do + SEQ_LEN_2=$(($increment + $SEQ_LEN)) + echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****" + + GPT_ARGS=" \ --num-layers $NLAYERS \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --seq-length $SEQ_LEN \ + --seq-length $SEQ_LEN_2 \ --position-embedding-type rotary \ + --max-position-embeddings $SEQ_LEN_2 \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --rampup-batch-size 32 32 2_000_000 \ @@ -99,7 +105,7 @@ GPT_ARGS=" \ $EXIT_OPTS \ " -OUTPUT_ARGS=" \ + OUTPUT_ARGS=" \ --log-interval 200 \ --save-interval $SAVE_INTERVAL \ --eval-interval 1000 \ From 6b8a26fda9135ffcffc72b7d8a358537b6aa2166 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 12:35:29 +0200 Subject: [PATCH 09/14] indentation --- ...B3-extrapolation-law-checkpoint-177k.slurm | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index 829d4bcf..a33874d8 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -116,51 +116,51 @@ for increment in {0..4000..100}; do --log-batch-size-to-tensorboard \ --log-validation-ppl-to-tensorboard \ " -# TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the 
current -# series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done + # TODO: Add --codecarbon-dir $CODECARBON_PATH \ if you want to use codecarbon, not adding it for now to make the current + # series of experiments consistent, especially speed-wise. Adding it once Tr6 and Tr7 are done -ZERO_STAGE=1 + ZERO_STAGE=1 -config_json="./ds_config.$SLURM_JOBID.json" + config_json="./ds_config.$SLURM_JOBID.json" -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat <<EOT > $config_json + # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() + cat <<EOT > $config_json { - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { +"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, +"train_batch_size": $GLOBAL_BATCH_SIZE, +"gradient_clipping": 1.0, +"zero_optimization": { "stage": $ZERO_STAGE - }, - "fp16": { +}, +"fp16": { "enabled": true, "loss_scale": 0, "loss_scale_window": 500, "hysteresis": 2, "min_loss_scale": 1, "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false +}, +"steps_per_print": 2000, +"wall_clock_breakdown": false } EOT -DEEPSPEED_ARGS=" \ + DEEPSPEED_ARGS=" \ --deepspeed \ --deepspeed_config ${config_json} \ --zero-stage ${ZERO_STAGE} \ --deepspeed-activation-checkpointing \ " -export LAUNCHER="python -u -m torch.distributed.launch \ + export LAUNCHER="python -u -m torch.distributed.launch \ --nproc_per_node $GPUS_PER_NODE \ --nnodes $NNODES \ --master_addr $MASTER_ADDR \ --master_port $MASTER_PORT \ " -export CMD=" \ - $(pwd)/pretrain_gpt.py \ + export CMD=" \ + `pwd`/pretrain_gpt.py \ --tensor-model-parallel-size $TP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \ $GPT_ARGS \ From 4b40871f2d58a39133257f85ee773d7be969ecb2 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 12:36:50 +0200 Subject: [PATCH 10/14] last adaptation ---
...1B3-extrapolation-law-checkpoint-177k.slurm | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index a33874d8..655b4ff1 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -171,18 +171,14 @@ EOT --data-impl mmap \ --split 949,50,1 \ --distributed-backend nccl \ - $DEEPSPEED_ARGS \ + $DEEPSPEED_ARGS \ " -# # clear old checkpoint as it'd mismatch while we sort things out -# rm -rf $SAVE_CHECKPOINT_PATH + # # clear old checkpoint as it'd mismatch while we sort things out + # rm -rf $SAVE_CHECKPOINT_PATH -echo $CMD + echo $CMD -# We create the folder where the logs and codecarbon will be stored. -mkdir -p $LOGS_PATH -# Uncomment if you use codecarbon -# mkdir -p $CODECARBON_PATH - -# to debug - add echo (it exits and prints what it would have launched) -srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt + # to debug - add echo (it exits and prints what it would have launched) + srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr4c-1B3-rotary-extrapolation.$SLURM_JOBID.out + done From 734afb9a4e75087bff0cfab18a1830d99f2c5472 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 14:20:15 +0200 Subject: [PATCH 11/14] rotary need max-position-embeddings args set to none --- .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm | 1 - 1 file changed, 1 deletion(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index 655b4ff1..345d4de9 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ 
b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -90,7 +90,6 @@ for increment in {0..4000..100}; do --ffn-hidden-size $FFN_HIDDEN_SIZE \ --seq-length $SEQ_LEN_2 \ --position-embedding-type rotary \ - --max-position-embeddings $SEQ_LEN_2 \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --rampup-batch-size 32 32 2_000_000 \ From e938721124a1a74f96ff9ab4e1f37dcbd9ff260d Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 15:17:04 +0200 Subject: [PATCH 12/14] typo --- .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index 345d4de9..d9d0f3bf 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -180,4 +180,4 @@ EOT # to debug - add echo (it exits and prints what it would have launched) srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $VAL_LOGS_PATH/tr4c-1B3-rotary-extrapolation.$SLURM_JOBID.out - done +done From a2acc4b133949e462480d1e377ee267820d4d002 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 13 Apr 2022 17:59:36 +0200 Subject: [PATCH 13/14] try --- .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index d9d0f3bf..24b7637a 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -79,7 +79,7 @@ EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " -for increment in {0..4000..100}; do +for increment in {100..4000..100}; do 
SEQ_LEN_2=$(($increment + $SEQ_LEN)) echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****" From de62cfe2ad455328eb3e96c9fa2604f5c0356f6b Mon Sep 17 00:00:00 2001 From: SaulLu Date: Thu, 14 Apr 2022 09:54:37 +0200 Subject: [PATCH 14/14] fix --- .../tr4c-1B3-extrapolation-law-checkpoint-177k.slurm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm index 24b7637a..9fe0b55e 100644 --- a/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm +++ b/train/tr4-1B3-rotary/tr4c-1B3-extrapolation-law-checkpoint-177k.slurm @@ -37,6 +37,8 @@ export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 cd $MEGATRON_DEEPSPEED_REPO +mkdir -p $VAL_LOGS_PATH + # so processes know who to talk to MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) MASTER_PORT=6000 @@ -79,7 +81,7 @@ EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " -for increment in {100..4000..100}; do +for increment in {0..4000..100}; do SEQ_LEN_2=$(($increment + $SEQ_LEN)) echo "***** Extrapolation for a seq length of $SEQ_LEN_2 *****"