Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions launch_leadercore_ablation_runpod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash
#
# Launch one LeaderCore 10L ablation run as a detached background job using
# 8 processes per node.
#
# Usage:
#   [DATA_ROOT_MODE=workspace|tmp] bash launch_leadercore_ablation_runpod.sh [VARIANT]
#
# VARIANT (default: base) selects one extra environment override:
#   base          no override
#   embedlr08     TIED_EMBED_LR=0.08
#   matrixlr005   MATRIX_LR=0.05
#   warmdown1800  WARMDOWN_ITERS=1800
#   tokemb_int8   INT8_KEEP_TOK_EMB_FP16=0
#
# DATA_ROOT_MODE (default: workspace) selects whether DATA_PATH and
# TOKENIZER_PATH point under /workspace/parameter-golf/data or under
# /tmp/parameter-golf-data.
#
# The run's combined stdout/stderr is written to
# $RECORD_ROOT/runpod_<mode>_<variant>/train.log and the PID of the
# background job to train.pid in the same directory.
set -euo pipefail

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

VARIANT="${1:-base}"
DATA_ROOT_MODE="${DATA_ROOT_MODE:-workspace}"

cd /workspace/parameter-golf

RECORD_ROOT="/workspace/parameter-golf/records/track_10min_16mb/2026-03-20_LeaderCore10L_ValidEval_TempOnly_Int8Search"
SCRIPT_PATH="$RECORD_ROOT/train_gpt.py"

case "$DATA_ROOT_MODE" in
  workspace)
    export DATA_PATH=/workspace/parameter-golf/data/datasets/fineweb10B_sp1024
    export TOKENIZER_PATH=/workspace/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
    ;;
  tmp)
    export DATA_PATH=/tmp/parameter-golf-data/datasets/fineweb10B_sp1024
    export TOKENIZER_PATH=/tmp/parameter-golf-data/tokenizers/fineweb_1024_bpe.model
    ;;
  *)
    die "DATA_ROOT_MODE must be one of: workspace, tmp"
    ;;
esac

# Settings shared by every variant; a variant arm below may override one.
export MAX_WALLCLOCK_SECONDS=600
export VAL_LOSS_EVERY=0
export TRAIN_LOG_EVERY=200
export VOCAB_SIZE=1024
export INT8_KEEP_TOK_EMB_FP16=1

# The case both validates the variant name and applies its override.
case "$VARIANT" in
  base)
    ;;
  embedlr08)
    export TIED_EMBED_LR=0.08
    ;;
  matrixlr005)
    export MATRIX_LR=0.05
    ;;
  warmdown1800)
    export WARMDOWN_ITERS=1800
    ;;
  tokemb_int8)
    export INT8_KEEP_TOK_EMB_FP16=0
    ;;
  *)
    die "Usage: $0 {base|embedlr08|matrixlr005|warmdown1800|tokemb_int8}"
    ;;
esac

# Run id and output directory follow one naming scheme for all variants.
export RUN_ID="leadercore10l_valid_${VARIANT}"
OUT_DIR="$RECORD_ROOT/runpod_${DATA_ROOT_MODE}_${VARIANT}"

# Fail here rather than inside the detached job, where a missing input would
# only be visible in train.log after this script had already printed "started".
[[ -f "$SCRIPT_PATH" ]] || die "training script not found: $SCRIPT_PATH"
[[ -e "$DATA_PATH" ]] || die "DATA_PATH does not exist: $DATA_PATH"
[[ -e "$TOKENIZER_PATH" ]] || die "TOKENIZER_PATH does not exist: $TOKENIZER_PATH"

mkdir -p "$OUT_DIR"

# nohup plus stdin from /dev/null lets the job outlive the invoking shell.
nohup python3 -m torch.distributed.run --standalone --nproc_per_node=8 "$SCRIPT_PATH" \
  > "$OUT_DIR/train.log" 2>&1 < /dev/null &
PID=$!
echo "$PID" > "$OUT_DIR/train.pid"
echo "started $RUN_ID data_root_mode=$DATA_ROOT_MODE pid=$PID log=$OUT_DIR/train.log"
109 changes: 109 additions & 0 deletions launch_leadercore_screen_runpod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/bin/bash
#
# Launch one short LeaderCore 10L proxy "screen" run as a detached
# background job.
#
# Usage:
#   [DATA_ROOT_MODE=workspace|tmp] [SCREEN_SECONDS=N] [NPROC_PER_NODE=N] \
#     bash launch_leadercore_screen_runpod.sh [VARIANT]
#
# Environment:
#   DATA_ROOT_MODE   (default: tmp) selects whether DATA_PATH and
#                    TOKENIZER_PATH point under /workspace/parameter-golf/data
#                    or under /tmp/parameter-golf-data.
#   SCREEN_SECONDS   (default: 180) exported as MAX_WALLCLOCK_SECONDS.
#   NPROC_PER_NODE   (default: 1) passed to torch.distributed.run.
#
# VARIANT (default: base) selects the extra environment overrides applied in
# the case statement below.
#
# The run's combined stdout/stderr is written to
# $RECORD_ROOT/screen_<mode>_<variant>/train.log and the PID of the
# background job to train.pid in the same directory.
set -euo pipefail

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

VARIANT="${1:-base}"
DATA_ROOT_MODE="${DATA_ROOT_MODE:-tmp}"
SCREEN_SECONDS="${SCREEN_SECONDS:-180}"
NPROC_PER_NODE="${NPROC_PER_NODE:-1}"

cd /workspace/parameter-golf

RECORD_ROOT="/workspace/parameter-golf/records/track_10min_16mb/2026-03-20_LeaderCore10L_ValidEval_TempOnly_Int8Search"
SCRIPT_PATH="$RECORD_ROOT/train_gpt.py"

case "$DATA_ROOT_MODE" in
  workspace)
    export DATA_PATH=/workspace/parameter-golf/data/datasets/fineweb10B_sp1024
    export TOKENIZER_PATH=/workspace/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
    ;;
  tmp)
    export DATA_PATH=/tmp/parameter-golf-data/datasets/fineweb10B_sp1024
    export TOKENIZER_PATH=/tmp/parameter-golf-data/tokenizers/fineweb_1024_bpe.model
    ;;
  *)
    die "DATA_ROOT_MODE must be one of: workspace, tmp"
    ;;
esac

# Settings shared by every variant.
export MAX_WALLCLOCK_SECONDS="$SCREEN_SECONDS"
export VAL_LOSS_EVERY=0
export TRAIN_LOG_EVERY=25
export VOCAB_SIZE=1024
export INT8_KEEP_TOK_EMB_FP16=1
# NOTE(review): presumably tells train_gpt.py to skip the final export step
# for proxy runs; confirm against the training script.
export PROXY_SKIP_EXPORT=1

# The case both validates the variant name and applies its overrides.
case "$VARIANT" in
  base)
    ;;
  embedlr08)
    export TIED_EMBED_LR=0.08
    ;;
  matrixlr005)
    export MATRIX_LR=0.05
    ;;
  matrixlr006)
    export MATRIX_LR=0.06
    ;;
  warmdown1800)
    export WARMDOWN_ITERS=1800
    ;;
  warmdown800)
    export WARMDOWN_ITERS=800
    ;;
  gradclip03)
    export GRAD_CLIP_NORM=0.3
    ;;
  muon099)
    export MUON_MOMENTUM=0.99
    ;;
  warmdown800_matrixlr006)
    export WARMDOWN_ITERS=800
    export MATRIX_LR=0.06
    ;;
  muonwarm300)
    export MUON_MOMENTUM_WARMUP_STEPS=300
    ;;
  qkgain17)
    export QK_GAIN_INIT=1.7
    ;;
  width520)
    export MODEL_DIM=520
    ;;
  *)
    die "Usage: $0 {base|embedlr08|matrixlr005|matrixlr006|warmdown1800|warmdown800|warmdown800_matrixlr006|gradclip03|muon099|muonwarm300|qkgain17|width520}"
    ;;
esac

# Run id and output directory follow one naming scheme for all variants.
export RUN_ID="leadercore10l_screen_${VARIANT}"
OUT_DIR="$RECORD_ROOT/screen_${DATA_ROOT_MODE}_${VARIANT}"

# Fail here rather than inside the detached job, where a missing input would
# only be visible in train.log after this script had already printed "started".
[[ -f "$SCRIPT_PATH" ]] || die "training script not found: $SCRIPT_PATH"
[[ -e "$DATA_PATH" ]] || die "DATA_PATH does not exist: $DATA_PATH"
[[ -e "$TOKENIZER_PATH" ]] || die "TOKENIZER_PATH does not exist: $TOKENIZER_PATH"

mkdir -p "$OUT_DIR"

# nohup plus stdin from /dev/null lets the job outlive the invoking shell.
nohup python3 -m torch.distributed.run --standalone --nproc_per_node="$NPROC_PER_NODE" "$SCRIPT_PATH" \
  > "$OUT_DIR/train.log" 2>&1 < /dev/null &
PID=$!
echo "$PID" > "$OUT_DIR/train.pid"
echo "started $RUN_ID data_root_mode=$DATA_ROOT_MODE nproc_per_node=$NPROC_PER_NODE screen_seconds=$SCREEN_SECONDS pid=$PID log=$OUT_DIR/train.log"
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
This directory contains the leader-core merge candidate rooted in the official `2026-03-19_SlidingWindow_FP16Emb_10L_MuonWD_OvertoneInit` training recipe, with validity-safe final eval and the stronger local int8 export plus temperature-only post-quant search.

Primary file:
- `train_gpt.py`

Saved full 8xH100 parity result:
- `runpod_tmp_base`: `final_int8_zlib_roundtrip_exact val_bpb: 1.20639536`
- `runpod_tmp_base`: `Total submission size int8+zlib: 15294320 bytes`
- `runpod_tmp_base`: `step_avg: 49.31ms`

Saved 1xH100 proxy screens on the same line:
- `screen_tmp_base`: `val_bpb: 1.7975`, `step_avg: 472.64ms`
- `screen_tmp_gradclip03`: `val_bpb: 1.7464`, `step_avg: 477.81ms`
- `screen_tmp_matrixlr006`: `val_bpb: 1.6751`, `step_avg: 471.55ms`
- `screen_tmp_muon099`: `val_bpb: 1.6582`, `step_avg: 416.73ms`
- `screen_tmp_warmdown800`: `val_bpb: 1.5984`, `step_avg: 523.52ms`
- `screen_tmp_warmdown800_matrixlr006`: `val_bpb: 1.4874`, `step_avg: 423.34ms`
- `screen_tmp_embedlr08`: `val_bpb: 1.8000`, `step_avg: 469.15ms`

Current proxy read:
- strongest single change: `warmdown800`
- strongest low-risk single change: `matrixlr006`
- strongest tested combination so far: `warmdown800 + matrixlr006`

First ablation batch:
- `base`
- `embedlr08`
- `matrixlr005`
- `warmdown1800`
- `tokemb_int8`

RunPod launcher:
```bash
bash /workspace/parameter-golf/launch_leadercore_ablation_runpod.sh base
```

For timing-parity runs, first stage data/tokenizer to local disk on the pod:
```bash
bash /workspace/parameter-golf/setup_local_parity_data_runpod.sh
DATA_ROOT_MODE=tmp bash /workspace/parameter-golf/launch_leadercore_ablation_runpod.sh base
```

Variant names map to these env changes (each launcher accepts only a subset; its usage message lists the names it supports):
- `base`: no overrides beyond the merged defaults
- `embedlr08`: `TIED_EMBED_LR=0.08`
- `matrixlr005`: `MATRIX_LR=0.05`
- `matrixlr006`: `MATRIX_LR=0.06`
- `warmdown1800`: `WARMDOWN_ITERS=1800`
- `warmdown800`: `WARMDOWN_ITERS=800`
- `warmdown800_matrixlr006`: `WARMDOWN_ITERS=800`, `MATRIX_LR=0.06`
- `gradclip03`: `GRAD_CLIP_NORM=0.3`
- `muon099`: `MUON_MOMENTUM=0.99`
- `muonwarm300`: `MUON_MOMENTUM_WARMUP_STEPS=300`
- `qkgain17`: `QK_GAIN_INIT=1.7`
- `width520`: `MODEL_DIM=520`
- `tokemb_int8`: `INT8_KEEP_TOK_EMB_FP16=0`

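Data root modes (selected with the `DATA_ROOT_MODE` environment variable; the ablation launcher defaults to `workspace`, the screen launcher to `tmp`):
- `workspace`: use `/workspace/parameter-golf/data/...`
- `tmp`: use `/tmp/parameter-golf-data/...` after local staging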
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/bin/bash
#
# Launch one short LeaderCore 10L proxy "screen" run as a detached
# background job.
#
# Usage:
#   [DATA_ROOT_MODE=workspace|tmp] [SCREEN_SECONDS=N] [NPROC_PER_NODE=N] \
#     bash <this script> [VARIANT]
#
# Environment:
#   DATA_ROOT_MODE   (default: tmp) selects whether DATA_PATH and
#                    TOKENIZER_PATH point under /workspace/parameter-golf/data
#                    or under /tmp/parameter-golf-data.
#   SCREEN_SECONDS   (default: 180) exported as MAX_WALLCLOCK_SECONDS.
#   NPROC_PER_NODE   (default: 1) passed to torch.distributed.run.
#
# VARIANT (default: base) selects the extra environment overrides applied in
# the case statement below.
#
# The run's combined stdout/stderr is written to
# $RECORD_ROOT/screen_<mode>_<variant>/train.log and the PID of the
# background job to train.pid in the same directory.
set -euo pipefail

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

VARIANT="${1:-base}"
DATA_ROOT_MODE="${DATA_ROOT_MODE:-tmp}"
SCREEN_SECONDS="${SCREEN_SECONDS:-180}"
NPROC_PER_NODE="${NPROC_PER_NODE:-1}"

cd /workspace/parameter-golf

RECORD_ROOT="/workspace/parameter-golf/records/track_10min_16mb/2026-03-20_LeaderCore10L_ValidEval_TempOnly_Int8Search"
SCRIPT_PATH="$RECORD_ROOT/train_gpt.py"

case "$DATA_ROOT_MODE" in
  workspace)
    export DATA_PATH=/workspace/parameter-golf/data/datasets/fineweb10B_sp1024
    export TOKENIZER_PATH=/workspace/parameter-golf/data/tokenizers/fineweb_1024_bpe.model
    ;;
  tmp)
    export DATA_PATH=/tmp/parameter-golf-data/datasets/fineweb10B_sp1024
    export TOKENIZER_PATH=/tmp/parameter-golf-data/tokenizers/fineweb_1024_bpe.model
    ;;
  *)
    die "DATA_ROOT_MODE must be one of: workspace, tmp"
    ;;
esac

# Settings shared by every variant.
export MAX_WALLCLOCK_SECONDS="$SCREEN_SECONDS"
export VAL_LOSS_EVERY=0
export TRAIN_LOG_EVERY=25
export VOCAB_SIZE=1024
export INT8_KEEP_TOK_EMB_FP16=1
# NOTE(review): presumably tells train_gpt.py to skip the final export step
# for proxy runs; confirm against the training script.
export PROXY_SKIP_EXPORT=1

# The case both validates the variant name and applies its overrides.
case "$VARIANT" in
  base)
    ;;
  embedlr08)
    export TIED_EMBED_LR=0.08
    ;;
  matrixlr005)
    export MATRIX_LR=0.05
    ;;
  matrixlr006)
    export MATRIX_LR=0.06
    ;;
  warmdown1800)
    export WARMDOWN_ITERS=1800
    ;;
  warmdown800)
    export WARMDOWN_ITERS=800
    ;;
  gradclip03)
    export GRAD_CLIP_NORM=0.3
    ;;
  muon099)
    export MUON_MOMENTUM=0.99
    ;;
  warmdown800_matrixlr006)
    # Combination of the warmdown800 and matrixlr006 overrides.
    export WARMDOWN_ITERS=800
    export MATRIX_LR=0.06
    ;;
  muonwarm300)
    export MUON_MOMENTUM_WARMUP_STEPS=300
    ;;
  qkgain17)
    export QK_GAIN_INIT=1.7
    ;;
  width520)
    export MODEL_DIM=520
    ;;
  *)
    die "Usage: $0 {base|embedlr08|matrixlr005|matrixlr006|warmdown1800|warmdown800|warmdown800_matrixlr006|gradclip03|muon099|muonwarm300|qkgain17|width520}"
    ;;
esac

# Run id and output directory follow one naming scheme for all variants.
export RUN_ID="leadercore10l_screen_${VARIANT}"
OUT_DIR="$RECORD_ROOT/screen_${DATA_ROOT_MODE}_${VARIANT}"

# Fail here rather than inside the detached job, where a missing input would
# only be visible in train.log after this script had already printed "started".
[[ -f "$SCRIPT_PATH" ]] || die "training script not found: $SCRIPT_PATH"
[[ -e "$DATA_PATH" ]] || die "DATA_PATH does not exist: $DATA_PATH"
[[ -e "$TOKENIZER_PATH" ]] || die "TOKENIZER_PATH does not exist: $TOKENIZER_PATH"

mkdir -p "$OUT_DIR"

# nohup plus stdin from /dev/null lets the job outlive the invoking shell.
nohup python3 -m torch.distributed.run --standalone --nproc_per_node="$NPROC_PER_NODE" "$SCRIPT_PATH" \
  > "$OUT_DIR/train.log" 2>&1 < /dev/null &
PID=$!
echo "$PID" > "$OUT_DIR/train.pid"
echo "started $RUN_ID data_root_mode=$DATA_ROOT_MODE nproc_per_node=$NPROC_PER_NODE screen_seconds=$SCREEN_SECONDS pid=$PID log=$OUT_DIR/train.log"
Loading