From ca42dbd33517c93b924609833ce4d92fc32b7ad2 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:46:33 -0500 Subject: [PATCH 1/9] add frontier install instrs for AMG2023 --- AMG2023/README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/AMG2023/README.md b/AMG2023/README.md index 476ad56..71af9f6 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -51,4 +51,47 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) ``` ## Frontier Installation +1. Load modules + ```sh + module reset + + module load cray-mpich/8.1.28 + module load craype-accel-amd-gfx90a + module load rocm + export MPICH_GPU_SUPPORT_ENABLED=1 + + # load compatible cmake version + module load Core/24.07 + module load cmake/3.27.9 + ``` +2. Configure hypre + - Clone hypre v2.27.0 and navigate to src: + ```sh + git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + cd into ~/hypre/src + ``` + - Configure hypre (in hypre/src) + ```sh + ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ``` + - Compile hypre (in hypre/src) + ```sh + # build with make + make + ``` +3. Configure AMG2023 + - Clone repo: + ```sh + git clone https://github.com/pssg-int/AMG2023` + cd AMG2023 + ``` + - Configure cmake + ```sh + mkdir build && cd build + cmake .. 
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` From 28abeca38d9f9396e89d5ef2091720a371f5f4f8 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:06:04 -0500 Subject: [PATCH 2/9] frontier scripts for AMG and for gpu benchmarks --- AMG2023/run_frontier_16.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_64.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_crontab.sh | 19 ++++++++ gpu-benchmarks/allgather/run_frontier.sh | 51 +++++++++++++++++++++ gpu-benchmarks/allreduce/run_frontier.sh | 46 +++++++++++++++++++ gpu-benchmarks/gemm/run_frontier.sh | 44 +++++++++++++++++++ 6 files changed, 272 insertions(+) create mode 100644 AMG2023/run_frontier_16.sh create mode 100644 AMG2023/run_frontier_64.sh create mode 100644 AMG2023/run_frontier_crontab.sh create mode 100644 gpu-benchmarks/allgather/run_frontier.sh create mode 100644 gpu-benchmarks/allreduce/run_frontier.sh create mode 100644 gpu-benchmarks/gemm/run_frontier.sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..8546887 --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability 
+echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c28de6a --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh new file mode 100644 index 0000000..dfd7bfe --- /dev/null +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -0,0 +1,51 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allgather: $(date) + # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun. + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --gpu-bind none \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allgather: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh new file mode 100644 index 0000000..caafc1a --- /dev/null +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -0,0 +1,46 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..6f9bb5b --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,44 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh + +#!/bin/bash +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 +# output directory +OUTPUT_DIR=$2 + +OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + NUM_TASKS=$(($NUM_NODES * 8)) + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start gemm: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --output $OUTPUT_FILE \ + $EXEC" + echo running: + echo $CMD + $CMD + echo end gemm: $(date) +} >> $OUTPUT_FILE From d59c821dd5b11603f99e984df12cdb7cf00f8c24 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:18:39 -0500 Subject: [PATCH 3/9] reformat readme --- AMG2023/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 71af9f6..3e9b90e 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -1,9 +1,9 @@ # AMG2023 README For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf). -## Perlmutter Compilation +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -Repository: [AMG2023](https://github.com/pssg-int/AMG2023) +## Perlmutter Compilation ### Steps to Compile @@ -50,7 +50,10 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 .. ``` -## Frontier Installation +## Frontier Compilation + +### Steps to Compile + 1. 
Load modules ```sh module reset From c76505da4eeba3de36130bf3345d88cd4236aaad Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:05:56 -0500 Subject: [PATCH 4/9] update AMG2023 and gpu-benchmarks scripts to use newest rocm and cray-mpich versions available on frontier --- AMG2023/README.md | 15 +++++++++------ AMG2023/run_frontier_16.sh | 11 ++++------- AMG2023/run_frontier_64.sh | 10 ++++------ gpu-benchmarks/allgather/run_frontier.sh | 4 +++- gpu-benchmarks/allreduce/run_frontier.sh | 4 +++- gpu-benchmarks/gemm/run_frontier.sh | 4 +++- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 3e9b90e..03832f1 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -58,24 +58,27 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) ```sh module reset - module load cray-mpich/8.1.28 + module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm + module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version module load Core/24.07 module load cmake/3.27.9 ``` -2. Configure hypre - - Clone hypre v2.27.0 and navigate to src: +2. 
Configure hypre (v2.32.0) + - Clone hypre v2.32.0 and navigate to src: ```sh - git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + git clone -b v2.32.0 https://github.com/hypre-space/hypre.git cd into ~/hypre/src ``` - Configure hypre (in hypre/src) ```sh - ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ + --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" ``` - Compile hypre (in hypre/src) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 8546887..92664c3 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -32,22 +32,19 @@ echo resetting modules: module reset # load modules echo loading modules: -module load cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index c28de6a..eb4c6d9 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -32,22 +32,20 @@ echo resetting modules: module reset # load modules echo loading modules: -module load 
cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index dfd7bfe..cb98dd6 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index caafc1a..5ac70ea 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x 
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 6f9bb5b..4ffd5e8 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,7 +21,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x From 3d75c0d02d51fa80cf1888a84311c8cb3e5c2a3d Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 23:54:08 -0500 Subject: [PATCH 5/9] nanogpt scripts --- nanoGPT/README.md | 73 ++++++++++++++------- nanoGPT/run_frontier16.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier64.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier_crontab.sh | 19 ++++++ nanoGPT/train_gpt_neox_20B_frontier.py | 46 +++++++++++++ nanoGPT/train_gpt_neox_5B_frontier.py | 46 +++++++++++++ 6 files changed, 342 insertions(+), 22 deletions(-) create mode 100644 nanoGPT/run_frontier16.sh create mode 100644 nanoGPT/run_frontier64.sh create mode 100644 nanoGPT/run_frontier_crontab.sh create mode 100644 nanoGPT/train_gpt_neox_20B_frontier.py create mode 100644 nanoGPT/train_gpt_neox_5B_frontier.py diff --git a/nanoGPT/README.md b/nanoGPT/README.md index 5c499fc..87e8189 100644 --- a/nanoGPT/README.md +++ b/nanoGPT/README.md @@ -1,33 +1,62 @@ -# nanoGPT Setup Instructions +# nanoGPT README +For more detailed installation parameters, please refer to [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT). 
-## Clone the Repository +Repository: [nanoGPT](https://github.com/axonn-ai/nanoGPT) -```sh -git clone https://github.com/axonn-ai/nanoGPT.git -``` -## Create Python Environment +## Perlmutter Setup -```sh -./scripts/create_python_env_perlmutter.sh -``` +### Setup steps -> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` -## Load PyTorch Module +2. Create Python Environment + ```sh + ./scripts/create_python_env_perlmutter.sh + ``` + > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. -```sh -module load pytorch/2.0.1 -``` +3. Load PyTorch Module + ```sh + module load pytorch/2.0.1 + ``` -## Activate the Environment +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` -```sh -source path_to_nanogptENV/bin/activate -``` +5. Download Data + ```sh + python nanoGPT/data/openwebtext/prepare.py + ``` -## Download Data +## Frontier Setup -```sh -python nanoGPT/data/openwebtext/prepare.py -``` \ No newline at end of file +### Setup steps + +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` + +2. Create Python Environment + ```sh + ./scripts/create_python_env_frontier.sh + ``` + > Note: You may need to modify the WRKSPC path and torch version in `create_python_env_frontier.sh`. + +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` + +5. 
Download Data + ```sh + python data/openwebtext/prepare.py + ``` \ No newline at end of file diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh new file mode 100644 index 0000000..63718c5 --- /dev/null +++ b/nanoGPT/run_frontier16.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier16.sh + +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source 
${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + +SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh new file mode 100644 index 0000000..1c9a75b --- /dev/null +++ b/nanoGPT/run_frontier64.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier64.sh + +export 
JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + 
+SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py new file mode 100644 index 0000000..5fcc430 --- /dev/null +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 32 +block_size = 512 +gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus + +# model +n_layer = 24 +n_head = 32 +n_embd = 4096 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file From 7e8749e901dfdec0a55ad4d6b15a10816cc837a6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 04:45:42 -0500 Subject: [PATCH 6/9] updated AMG2023 and gpu-benchmarks run scripts --- AMG2023/README.md | 17 ++++++++++++----- AMG2023/run_frontier_16.sh | 4 +++- AMG2023/run_frontier_64.sh | 4 +++- gpu-benchmarks/README.md | 14 ++++++++++++++ gpu-benchmarks/allgather/run_frontier.sh | 6 +++--- gpu-benchmarks/allreduce/run_frontier.sh | 6 +++--- gpu-benchmarks/gemm/run_frontier.sh | 6 +++--- 7 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 gpu-benchmarks/README.md diff --git a/AMG2023/README.md b/AMG2023/README.md index 03832f1..14c75c8 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -60,7 +60,7 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm/6.2.4 + module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version @@ -76,9 +76,10 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) - Configure hypre (in hypre/src) ```sh ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ - --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ - --with-MPI-include="${MPICH_DIR}/include" \ - --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" + 
--with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse" ``` - Compile hypre (in hypre/src) ```sh @@ -91,11 +92,17 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) git clone https://github.com/pssg-int/AMG2023` cd AMG2023 ``` + - Add mpiP to LD_LIBRARY_PATH + ```sh + export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH + ``` - Configure cmake ```sh mkdir build && cd build - cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" ``` - Compile AMG2023 (in AMG2023/build) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 92664c3..d635c31 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -34,11 +34,13 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index eb4c6d9..8854ca1 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -34,12 +34,14 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export 
MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md new file mode 100644 index 0000000..c8f9c25 --- /dev/null +++ b/gpu-benchmarks/README.md @@ -0,0 +1,14 @@ +# gpu-benchmarks README +Code Repository: [gpu-benchmarks](#TODO:) + +## Perlmutter Compilation + +### Steps to Compile + +TODO: + +## Frontier Compilation + +### Steps to Compile + +TODO: \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index cb98dd6..75216e8 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 5ac70ea..729c539 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module 
load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 4ffd5e8..d089dd1 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,11 +21,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x NUM_TASKS=$(($NUM_NODES * 8)) From 56e0fd5a4b4f47b807fe6db75cfeb1ab4c5476a9 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:56:25 -0500 Subject: [PATCH 7/9] use gpu-bind=none for frontier --- AMG2023/run_frontier_16.sh | 1 + AMG2023/run_frontier_64.sh | 2 +- gpu-benchmarks/allgather/run_frontier.sh | 3 ++- gpu-benchmarks/allreduce/run_frontier.sh | 3 ++- gpu-benchmarks/gemm/run_frontier.sh | 3 ++- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index d635c31..c0a69b0 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -3,6 +3,7 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 
8854ca1..8baabe8 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -3,6 +3,7 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log @@ -39,7 +40,6 @@ module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so # mpiP export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 75216e8..79cedc7 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x @@ -50,4 +51,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log echo $CMD $CMD echo end allgather: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 729c539..56bd2fe 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x @@ -45,4 +46,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log echo $CMD $CMD echo end allreduce: $(date) -} >> 
$OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index d089dd1..9ccecbd 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -24,6 +24,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x @@ -43,4 +44,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log echo $CMD $CMD echo end gemm: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE From a087255bc2a22e152a5508f3103c88bfc7847ebc Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:28:22 -0500 Subject: [PATCH 8/9] update gpu-benchmarks to specify ROCM version --- AMG2023/run_frontier_16.sh | 7 ++++--- AMG2023/run_frontier_64.sh | 7 ++++--- gpu-benchmarks/allgather/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/allreduce/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/gemm/run_frontier.sh | 25 ++++++++++++++++-------- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index c0a69b0..c51b52d 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE 
$SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8baabe8..c7a7a3e 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 79cedc7..7fc10b4 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo 
"Usage: $0 " +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 56bd2fe..855a486 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 " +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module 
load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 9ccecbd..c5348be 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -4,14 +4,23 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh #!/bin/bash -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " exit 1 fi +# `5.7.1` or `6.1.3` +ROCM_VERSION=$1 # `16` or `64` -NUM_NODES=$1 +NUM_NODES=$2 # output directory -OUTPUT_DIR=$2 +OUTPUT_DIR=$3 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log @@ -21,13 +30,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) export MPICH_GPU_SUPPORT_ENABLED=1 From a68f2a16f3b0a1cded21ddf8b7cafd46c293527a 
Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:36:47 -0500 Subject: [PATCH 9/9] updated nanogpt scripts and reduced batch size due to HIP OOM errors --- nanoGPT/run_frontier16.sh | 84 +++++++++++++-------------- nanoGPT/run_frontier64.sh | 84 +++++++++++++-------------- nanoGPT/train_gpt_neox_5B_frontier.py | 4 +- 3 files changed, 82 insertions(+), 90 deletions(-) diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh index 63718c5..901561e 100644 --- a/nanoGPT/run_frontier16.sh +++ b/nanoGPT/run_frontier16.sh @@ -3,30 +3,19 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier16.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" 
@@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 
+OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh index 1c9a75b..3201b51 100644 --- a/nanoGPT/run_frontier64.sh +++ b/nanoGPT/run_frontier64.sh @@ -3,30 +3,19 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier64.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID 
OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" @@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export 
TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION 
$SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py index 5fcc430..4ce7b55 100644 --- a/nanoGPT/train_gpt_neox_5B_frontier.py +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -8,9 +8,9 @@ # these make the total batch size be ~0.5M # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 -batch_size = 32 +batch_size = 16 block_size = 512 -gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus +gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus # model n_layer = 24