diff --git a/AMG2023/README.md b/AMG2023/README.md
index 476ad56..14c75c8 100644
--- a/AMG2023/README.md
+++ b/AMG2023/README.md
@@ -1,9 +1,9 @@
 # AMG2023 README
 For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf).
-## Perlmutter Compilation
+Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/)
-Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
+## Perlmutter Compilation

 ### Steps to Compile

@@ -50,5 +50,61 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
    cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 ..
    ```

-## Frontier Installation
+## Frontier Compilation
+
+### Steps to Compile
+
+1. Load modules
+   ```sh
+   module reset
+
+   module load cray-mpich/8.1.30
+   module load craype-accel-amd-gfx90a
+   module load rocm/6.1.3
+   export MPICH_GPU_SUPPORT_ENABLED=1
+
+   # load a compatible cmake version
+   module load Core/24.07
+   module load cmake/3.27.9
+   ```
+2. Configure hypre (v2.32.0)
+   - Clone hypre v2.32.0 and navigate to src:
+     ```sh
+     git clone -b v2.32.0 https://github.com/hypre-space/hypre.git
+     cd hypre/src
+     ```
+   - Configure hypre (in hypre/src)
+     ```sh
+     ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \
+         --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \
+         --with-MPI-include="${MPICH_DIR}/include" \
+         CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \
+         LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse"
+     ```
+   - Compile hypre (in hypre/src)
+     ```sh
+     # build with make
+     make
+     ```
+3. Configure AMG2023
+   - Clone repo:
+     ```sh
+     git clone https://github.com/pssg-int/AMG2023
+     cd AMG2023
+     ```
+   - Add mpiP to LD_LIBRARY_PATH
+     ```sh
+     export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+     ```
+   - Configure cmake
+     ```sh
+     mkdir build && cd build
+     cmake .. \
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..c51b52d --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.30 +module load craype-accel-amd-gfx90a +module load rocm/6.1.3 + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH +export MPIP="-o -f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c7a7a3e --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather 
benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.30 +module load craype-accel-amd-gfx90a +module load rocm/6.1.3 + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH +export MPIP="-o -f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/DeepCAM/README.md b/DeepCAM/README.md new file mode 100644 index 0000000..94e6880 --- /dev/null +++ b/DeepCAM/README.md @@ -0,0 +1,131 @@ +# DeepCAM README +For more detailed installation parameters, please refer to DeepCAM install guide + +Perlmutter Repository: [hpc_results_v3.0](https://github.com/hpcgroup/hpc_results_v3.0) +Frontier Repository: [hpc](https://github.com/hpcgroup/hpc) + + +## Perlmutter Setup + +### Setup steps + +## Frontier Setup + +### Setup steps + +#### 1. 
Pytorch Install +- Load modules + ```bash + module reset + module load PrgEnv-gnu/8.5.0 + module load rocm/6.1.3 + module load craype-accel-amd-gfx90a + module load cray-python/3.9.13.1 +- Create env variables + ```bash + DEEPCAM_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam/ + PYVENV_ROOT=${DEEPCAM_ROOT}/.venv + PYVENV_SITEPKGS=${PYVENV_ROOT}/lib/python3.9/site-packages + + cd ${DEEPCAM_ROOT} + ``` +- Create python virtual env + ```bash + python -m venv ${PYVENV_ROOT} + source ${PYVENV_ROOT}/bin/activate + ``` +- Install torch and mpi4py + ```bash + # torch==2.5.0 + pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.1 + + MPICC="cc -shared" pip install --no-cache-dir --no-binary=mpi4py mpi4py + ``` +- Install AWS-OCI-RCCL plugin + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + rocm_version=6.1.3 + # Load modules + module load PrgEnv-gnu/8.5.0 + module load rocm/$rocm_version + module load craype-accel-amd-gfx90a + module load gcc-native/12.3 + module load cray-mpich/8.1.30 + #module load libtool + libfabric_path=/opt/cray/libfabric/1.15.2.0 + + # Download the plugin repo + git clone --recursive https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl + cd aws-ofi-rccl + + # Build the plugin + ./autogen.sh + export LD_LIBRARY_PATH=/opt/rocm-$rocm_version/hip/lib:$LD_LIBRARY_PATH + PLUG_PREFIX=$PWD + + CC=hipcc CFLAGS=-I/opt/rocm-$rocm_version/rccl/include ./configure \ + --with-libfabric=$libfabric_path --with-rccl=/opt/rocm-$rocm_version --enable-trace \ + --prefix=$PLUG_PREFIX --with-hip=/opt/rocm-$rocm_version/hip --with-mpi=$MPICH_DIR + + make + make install + + # Reminder to export the plugin to your path + echo $PLUG_PREFIX + echo "Add the following line in the environment to use the AWS OFI RCCL plugin" + echo "export LD_LIBRARY_PATH="$PLUG_PREFIX"/lib:$""LD_LIBRARY_PATH" + ``` +- Install supporting dependencies + ```bash + cd ${DEEPCAM_ROOT} + + pip install wandb + pip install gym + pip install pyspark + pip install scikit-learn + pip install scikit-image + pip install opencv-python + pip install wheel + pip install tomli + pip install h5py + + # tensorboard + pip install tensorboard + pip install tensorboard_plugin_profile + pip install tensorboard-plugin-wit + pip install tensorboard-pytorch + + pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git + ``` +- Install mlperf-logging + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + git clone -b hpc-1.0-branch https://github.com/mlcommons/logging mlperf-logging + # may need to manually change mlperf-logging/VERSION to a valid version number (e.g. 1.0.0.rc2) + pip install -e mlperf-logging + + rm ${PYVENV_SITEPKGS}/mlperf-logging.egg-link + cp -r ./mlperf-logging/mlperf_logging ${PYVENV_SITEPKGS}/mlperf_logging + cp -r ./mlperf-logging/mlperf_logging.egg-info ${PYVENV_SITEPKGS}/mlperf_logging.egg-info + ``` + +#### 2. Download src code +- Download from PSSG Frontier repo for DeepCAM (linked at top of README) + ```bash + # REPLACE WITH YOUR PATH + PRFX=/lustre/orion/csc569/scratch/keshprad + DEEPCAM_ROOT=${PRFX}/deepcam + + mkdir -p ${DEEPCAM_ROOT} + cd ${DEEPCAM_ROOT} + + git clone https://github.com/hpcgroup/hpc.git hpc + ``` + +#### 3. 
Download dataset with globus +- [Globus Link](https://app.globus.org/file-manager?origin_id=0b226e2c-4de0-11ea-971a-021304b0cca7&origin_path=%2F) + - Download to `$DEEPCAM_ROOT/data` \ No newline at end of file diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh new file mode 100644 index 0000000..593608a --- /dev/null +++ b/DeepCAM/run_frontier_16.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module list + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. 
\ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=1 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh new file mode 100644 index 0000000..5c406fe --- /dev/null +++ b/DeepCAM/run_frontier_64.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module list + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export 
NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. \ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=4 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_crontab.sh b/DeepCAM/run_frontier_crontab.sh new file mode 100644 index 0000000..6d70161 --- /dev/null +++ b/DeepCAM/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles
+
+# run sbatch script
+script=$PERF_VARIABILITY_ROOT/DeepCAM/run_frontier_$NUM_NODES\.sh
+sbatch $script
\ No newline at end of file
diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md
new file mode 100644
index 0000000..c8f9c25
--- /dev/null
+++ b/gpu-benchmarks/README.md
@@ -0,0 +1,14 @@
+# gpu-benchmarks README
+Code Repository: [gpu-benchmarks](#TODO:)
+
+## Perlmutter Compilation
+
+### Steps to Compile
+
+TODO:
+
+## Frontier Compilation
+
+### Steps to Compile
+
+TODO:
\ No newline at end of file
diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh
new file mode 100644
index 0000000..7fc10b4
--- /dev/null
+++ b/gpu-benchmarks/allgather/run_frontier.sh
@@ -0,0 +1,63 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh
+
+#!/bin/bash
+if [ "$#" -ne 4 ]; then
+    echo "Usage: $0 <comm_type (mpi|rccl)> <rocm_version> <num_nodes> <output_dir>"
+    exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$2
+# `16` or `64`
+NUM_NODES=$3
+# output directory
+OUTPUT_DIR=$4
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+    MPICH_VERSION=8.1.30
+else
+    MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log
+
+{
+    # reset modules
+    echo resetting modules:
+    module reset
+    # load modules
+    echo loading modules:
+    module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+    module load cray-mpich/${MPICH_VERSION}
+    module load rocm/${ROCM_VERSION}
+    module list
+
+    GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+    EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x
+    NUM_TASKS=$(($NUM_NODES * 8))
+    MIN_MSG_SIZE=$((1 * 1024))
+    MAX_MSG_SIZE=$((1 * 1024 * 1024))
+    ITERATIONS=100
+
+    export MPICH_GPU_SUPPORT_ENABLED=1
+    export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+    echo start allgather: $(date)
+    # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun.
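+    # Illustrative expansion (assuming COMM_TYPE=mpi, ROCM_VERSION=6.1.3, NUM_NODES=16;
+    # all other values come from this script), the command assembled below becomes roughly:
+    #   srun -N 16 -n 128 --gpus-per-node 8 --gpus-per-task 1 --ntasks-per-node 8 \
+    #     --gpu-bind none --output $OUTPUT_FILE \
+    #     $GPU_BENCHMARKS_ROOT/allgather_mpi_rocm-6.1.3.x 128 1024 1048576 100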
+ CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --gpu-bind none \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allgather: $(date) +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh new file mode 100644 index 0000000..855a486 --- /dev/null +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -0,0 +1,58 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) + +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 +# `16` or `64` +NUM_NODES=$3 +# output directory +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} + module list + + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..c5348be --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,56 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh
+
+#!/bin/bash
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <rocm_version> <num_nodes> <output_dir>"
+    exit 1
+fi
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$1
+# `16` or `64`
+NUM_NODES=$2
+# output directory
+OUTPUT_DIR=$3
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+    MPICH_VERSION=8.1.30
+else
+    MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log
+
+{
+    # reset modules
+    echo resetting modules:
+    module reset
+    # load modules
+    echo loading modules:
+    module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+    module load cray-mpich/${MPICH_VERSION}
+    module load rocm/${ROCM_VERSION}
+    module list
+
+    GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+    EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x
+    NUM_TASKS=$(($NUM_NODES * 8))
+
+    export MPICH_GPU_SUPPORT_ENABLED=1
+    export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+    echo start gemm: $(date)
+    CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+        --gpus-per-node 8 \
+        --gpus-per-task 1 \
+        --ntasks-per-node 8 \
+        --output $OUTPUT_FILE \
+        $EXEC"
+    echo running:
+    echo $CMD
+    $CMD
+    echo end gemm: $(date)
+} &>> $OUTPUT_FILE
diff --git a/nanoGPT/README.md b/nanoGPT/README.md
index 5c499fc..87e8189 100644
--- a/nanoGPT/README.md
+++ b/nanoGPT/README.md
@@ -1,33 +1,62 @@
-# nanoGPT Setup Instructions
+# nanoGPT README
+For more detailed installation parameters, please refer to the [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT).
-## Clone the Repository
+Repository: [nanoGPT](https://github.com/axonn-ai/nanoGPT)
-```sh
-git clone https://github.com/axonn-ai/nanoGPT.git
-```
-## Create Python Environment
+## Perlmutter Setup
-```sh
-./scripts/create_python_env_perlmutter.sh
-```
+### Setup steps
-> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
-## Load PyTorch Module
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_perlmutter.sh
+   ```
+   > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
-```sh
-module load pytorch/2.0.1
-```
+3. Load PyTorch Module
+   ```sh
+   module load pytorch/2.0.1
+   ```
-## Activate the Environment
+4. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
-```sh
-source path_to_nanogptENV/bin/activate
-```
+5. Download Data
+   ```sh
+   python nanoGPT/data/openwebtext/prepare.py
+   ```
-## Download Data
+## Frontier Setup
-```sh
-python nanoGPT/data/openwebtext/prepare.py
-```
\ No newline at end of file
+### Setup steps
+
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
+
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_frontier.sh
+   ```
+   > Note: You may need to modify the WKSPC path and torch version in `create_python_env_frontier.sh`.
+
+3. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
+
+4. 
Download Data + ```sh + python data/openwebtext/prepare.py + ``` \ No newline at end of file diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh new file mode 100644 index 0000000..901561e --- /dev/null +++ b/nanoGPT/run_frontier16.sh @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier16.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/${ROCM_VERSION} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load cray-mpich/8.1.30 +module list +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" +# log start date +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE +# log end date +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh new file mode 100644 index 0000000..3201b51 --- /dev/null +++ b/nanoGPT/run_frontier64.sh @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH --gpu-bind 
none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier64.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/${ROCM_VERSION} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load cray-mpich/8.1.30 +module list +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" +# log start date +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE +# log end date +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? + +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py new file mode 100644 index 0000000..4ce7b55 --- /dev/null +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 16 +block_size = 512 +gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus + +# model +n_layer = 24 +n_head = 32 +n_embd = 4096 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
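+
+# Rough tokens per optimizer step implied by the values above (using this config's own
+# "per_gpu x num_gpus" convention for gradient_accumulation_steps; an estimate, not a measured number):
+#   16 batch_size * 512 block_size * 256 gradient_accumulation_steps = 2,097,152 (~2.1M) tokens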
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+max_iters = 30 # total number of training iterations
+
+# axonn params
+G_intra_d=16
+G_intra_c=1
+G_intra_r=1
+compile=False # disable compile for axonn
+gradient_checkpointing=True
+
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 1
+
+# weight decay
+weight_decay = 1e-1
+
+# log every iteration
+log_interval=1
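+
+# Approximate parameter count implied by this config, using the standard 12 * n_layer * n_embd^2
+# transformer estimate plus token embeddings (assuming the ~50k-token GPT-2 vocabulary):
+#   12 * 24 * 4096^2 ≈ 4.8B, plus ~0.2B embedding parameters, i.e. roughly the 5B in the filename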