-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathtrain_long.sh
More file actions
executable file
·36 lines (31 loc) · 1.04 KB
/
train_long.sh
File metadata and controls
executable file
·36 lines (31 loc) · 1.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/bash
# Launch a torchrun training job using configs/diadistill_train_long.yaml.
#
# Usage: ./train_long.sh
#
# Optional environment variables:
#   MASTER_PORT     port passed to torchrun --master_port
#                   (default: random value in 20000-39999)
#   NPROC_PER_NODE  value passed to torchrun --nproc_per_node (default: 8)
#
# Each run writes to outputs/<YYYYmmdd_HHMMSS>/:
#   train_config_input.yaml  copy of the config used for the run
#   launch_info.txt          launch metadata, including the exact command
#   train.log                combined stdout/stderr of the training command
set -euo pipefail

# Always run from the directory containing this script (expected to be the
# repo root) so relative paths in configs/code are valid.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")"

CONFIG=configs/diadistill_train_long.yaml
RUN_TS="$(date +%Y%m%d_%H%M%S)"
OUTPUT_ROOT="outputs"
LOGDIR="${OUTPUT_ROOT}/${RUN_TS}"
WANDB_SAVE_DIR="$LOGDIR"
MASTER_PORT="${MASTER_PORT:-$((20000 + RANDOM % 20000))}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"

export WANDB_MODE=disabled

# Fail before creating the run directory so a missing config does not leave
# an empty outputs/<timestamp>/ behind.
if [[ ! -f "$CONFIG" ]]; then
  printf 'error: config not found: %s (cwd: %s)\n' "$CONFIG" "$PWD" >&2
  exit 1
fi

# Build the command once so the logged command and the executed command can
# never drift apart.
train_cmd=(
  torchrun
  --nproc_per_node="$NPROC_PER_NODE"
  --master_port "$MASTER_PORT"
  train.py
  --config_path "$CONFIG"
  --logdir "$LOGDIR"
  --wandb-save-dir "$WANDB_SAVE_DIR"
  --disable-wandb
)
# %q shell-quotes each word so the logged line can be pasted back into a shell;
# the trailing separator space is stripped afterwards.
printf -v train_cmd_str '%q ' "${train_cmd[@]}"
train_cmd_str="${train_cmd_str% }"

mkdir -p "$LOGDIR"
cp -- "$CONFIG" "$LOGDIR/train_config_input.yaml"

{
  echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "pwd=$(pwd)"
  echo "config=$CONFIG"
  echo "master_port=$MASTER_PORT"
  echo "nproc_per_node=$NPROC_PER_NODE"
  echo "command=$train_cmd_str"
} > "$LOGDIR/launch_info.txt"

echo "CONFIG=$CONFIG"
echo "RUN_DIR=$LOGDIR"
echo "MASTER_PORT=$MASTER_PORT"
echo "NPROC_PER_NODE=$NPROC_PER_NODE"

# pipefail (set above) makes the script exit with the training command's
# failure status even though tee is the last stage of the pipeline.
"${train_cmd[@]}" 2>&1 | tee "$LOGDIR/train.log"