-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun.bash
More file actions
74 lines (66 loc) · 1.88 KB
/
run.bash
File metadata and controls
74 lines (66 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
##################
# Configuration
#
# Experiment settings consumed by the launch commands further down in this
# script. Each value is forwarded to the Python entry point as the
# command-line flag of the same (lower-cased) name.
DATASET="wikitext-103-v1"
NUM_NODES=8
DEVICES="0,1,2,3,4,5,6,7"
BATCH_SIZE=8
NUM_MICROBATCHES=1
BLOCK_SIZE=1024
N_EMBD=768
N_LAYER=12
N_HEAD=12
# 4 x 2 mesh.
# NOTE(review): presumably four pipeline stages of 3 layers each (summing to
# N_LAYER=12), replicated twice across the NUM_NODES=8 workers, which would
# match the "pp4-dp2" suffix in EXPNAME -- confirm against the Python script.
STAGES="3,3,3,3"
EXPNAME="$DATASET-base-pp4-dp2"
CHECKPOINT_DIR="checkpoints/$EXPNAME"
WANDB_PROJECT="$EXPNAME"
# Add the current directory to the Python path.
# The ${VAR:+...} form only emits the existing value and the ':' separator
# when PYTHONPATH is already set and non-empty, so an unset PYTHONPATH yields
# "./" rather than ":./" (which would carry a stray empty entry).
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}./"
# Launch commands
#
# Each experiment is assembled as an argument array rather than a string fed
# to `eval`, so no value is ever re-parsed by the shell (no accidental word
# splitting, globbing or command injection through a configuration value).

# Flags shared by every experiment; the values come from the configuration
# variables defined earlier in this script.
common_args=(
  --dataset "$DATASET"
  --num_nodes "$NUM_NODES"
  --devices "$DEVICES"
  --batch_size "$BATCH_SIZE"
  --num_microbatches "$NUM_MICROBATCHES"
  --block_size "$BLOCK_SIZE"
  --n_embd "$N_EMBD"
  --n_layer "$N_LAYER"
  --n_head "$N_HEAD"
  --stages "$STAGES"
  --checkpoint_dir "$CHECKPOINT_DIR"
  --wandb_project "$WANDB_PROJECT"
)

# Extra default flags used by the pp_diloco_async.py experiments.
async_args=(--p_sparta 0.05 --beta1 0.99 --async_sparta_delay 10)

#######################################
# Log and run one experiment in the foreground.
# Globals:   common_args (read)
# Arguments: $1  - path of the Python script to run
#            $2+ - extra flags appended after the shared ones
# Outputs:   writes the shell-quoted command line to stdout before running it
# Returns:   the exit status of the python process, so a failed run is no
#            longer masked (a bare `wait` after `cmd &` always reports 0)
#######################################
run_experiment() {
  local script=$1
  shift
  local -a cmd=(python "$script" "${common_args[@]}" "$@")
  # %q prints each word so the logged line can be pasted back into a shell.
  printf '%q ' "${cmd[@]}"
  printf '\n'
  "${cmd[@]}"
}

# Ours
run_experiment ./examples/pp_diloco_async.py "${async_args[@]}" \
  --sparta_method ema --wandb_name AsyncPP-Ours

# The disabled variants below repeat some flags already present in async_args
# (e.g. --p_sparta). NOTE(review): this relies on the Python argument parser
# keeping the last occurrence of a repeated flag, as the original string-based
# commands did -- confirm before enabling.

# # DP
# run_experiment ./examples/pp_diloco_async.py "${async_args[@]}" \
#   --p_sparta 1.0 --async_sparta_delay 0 --wandb_name AsyncPP-DP

# # SPARTA
# run_experiment ./examples/pp_diloco_async.py "${async_args[@]}" \
#   --async_sparta_delay 0 --wandb_name AsyncPP-SPARTA

# # AsyncSPARTA
# run_experiment ./examples/pp_diloco_async.py "${async_args[@]}" \
#   --wandb_name AsyncPP-AsyncSPARTA

# # FullSync (synchronous entry point; does not take the async_args defaults)
# run_experiment ./examples/pp_diloco_sync.py \
#   --p_sparta 1.0 --async_sparta_delay 0 --wandb_name FullSync