From ca42dbd33517c93b924609833ce4d92fc32b7ad2 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:46:33 -0500 Subject: [PATCH 1/7] add frontier install instrs for AMG2023 --- AMG2023/README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/AMG2023/README.md b/AMG2023/README.md index 476ad56..71af9f6 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -51,4 +51,47 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) ``` ## Frontier Installation +1. Load modules + ```sh + module reset + + module load cray-mpich/8.1.28 + module load craype-accel-amd-gfx90a + module load rocm + export MPICH_GPU_SUPPORT_ENABLED=1 + + # load compatible cmake version + module load Core/24.07 + module load cmake/3.27.9 + ``` +2. Configure hypre + - Clone hypre v2.27.0 and navigate to src: + ```sh + git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + cd into ~/hypre/src + ``` + - Configure hypre (in hypre/src) + ```sh + ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ``` + - Compile hypre (in hypre/src) + ```sh + # build with make + make + ``` +3. Configure AMG2023 + - Clone repo: + ```sh + git clone https://github.com/pssg-int/AMG2023` + cd AMG2023 + ``` + - Configure cmake + ```sh + mkdir build && cd build + cmake .. 
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` From 28abeca38d9f9396e89d5ef2091720a371f5f4f8 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:06:04 -0500 Subject: [PATCH 2/7] frontier scripts for AMG and for gpu benchmarks --- AMG2023/run_frontier_16.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_64.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_crontab.sh | 19 ++++++++ gpu-benchmarks/allgather/run_frontier.sh | 51 +++++++++++++++++++++ gpu-benchmarks/allreduce/run_frontier.sh | 46 +++++++++++++++++++ gpu-benchmarks/gemm/run_frontier.sh | 44 +++++++++++++++++++ 6 files changed, 272 insertions(+) create mode 100644 AMG2023/run_frontier_16.sh create mode 100644 AMG2023/run_frontier_64.sh create mode 100644 AMG2023/run_frontier_crontab.sh create mode 100644 gpu-benchmarks/allgather/run_frontier.sh create mode 100644 gpu-benchmarks/allreduce/run_frontier.sh create mode 100644 gpu-benchmarks/gemm/run_frontier.sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..8546887 --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability 
+echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c28de6a --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh new file mode 100644 index 0000000..dfd7bfe --- /dev/null +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -0,0 +1,51 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allgather: $(date) + # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun. + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --gpu-bind none \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allgather: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh new file mode 100644 index 0000000..caafc1a --- /dev/null +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -0,0 +1,46 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..6f9bb5b --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,44 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh + +#!/bin/bash +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 +# output directory +OUTPUT_DIR=$2 + +OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + NUM_TASKS=$(($NUM_NODES * 8)) + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start gemm: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --output $OUTPUT_FILE \ + $EXEC" + echo running: + echo $CMD + $CMD + echo end gemm: $(date) +} >> $OUTPUT_FILE From d59c821dd5b11603f99e984df12cdb7cf00f8c24 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:18:39 -0500 Subject: [PATCH 3/7] reformat readme --- AMG2023/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 71af9f6..3e9b90e 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -1,9 +1,9 @@ # AMG2023 README For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf). -## Perlmutter Compilation +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -Repository: [AMG2023](https://github.com/pssg-int/AMG2023) +## Perlmutter Compilation ### Steps to Compile @@ -50,7 +50,10 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 .. ``` -## Frontier Installation +## Frontier Compilation + +### Steps to Compile + 1. 
Load modules ```sh module reset From c76505da4eeba3de36130bf3345d88cd4236aaad Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:05:56 -0500 Subject: [PATCH 4/7] update AMG2023 and gpu-benchmarks scripts to use newest rocm and cray-mpich versions available on frontier --- AMG2023/README.md | 15 +++++++++------ AMG2023/run_frontier_16.sh | 11 ++++------- AMG2023/run_frontier_64.sh | 10 ++++------ gpu-benchmarks/allgather/run_frontier.sh | 4 +++- gpu-benchmarks/allreduce/run_frontier.sh | 4 +++- gpu-benchmarks/gemm/run_frontier.sh | 4 +++- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 3e9b90e..03832f1 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -58,24 +58,27 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) ```sh module reset - module load cray-mpich/8.1.28 + module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm + module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version module load Core/24.07 module load cmake/3.27.9 ``` -2. Configure hypre - - Clone hypre v2.27.0 and navigate to src: +2. 
Configure hypre (v2.32.0) + - Clone hypre v2.32.0 and navigate to src: ```sh - git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + git clone -b v2.32.0 https://github.com/hypre-space/hypre.git cd into ~/hypre/src ``` - Configure hypre (in hypre/src) ```sh - ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ + --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" ``` - Compile hypre (in hypre/src) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 8546887..92664c3 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -32,22 +32,19 @@ echo resetting modules: module reset # load modules echo loading modules: -module load cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index c28de6a..eb4c6d9 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -32,22 +32,20 @@ echo resetting modules: module reset # load modules echo loading modules: -module load 
cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index dfd7bfe..cb98dd6 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index caafc1a..5ac70ea 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x 
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 6f9bb5b..4ffd5e8 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,7 +21,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x From 7e8749e901dfdec0a55ad4d6b15a10816cc837a6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 04:45:42 -0500 Subject: [PATCH 5/7] updated AMG2023 and gpu-benchmarks run scripts --- AMG2023/README.md | 17 ++++++++++++----- AMG2023/run_frontier_16.sh | 4 +++- AMG2023/run_frontier_64.sh | 4 +++- gpu-benchmarks/README.md | 14 ++++++++++++++ gpu-benchmarks/allgather/run_frontier.sh | 6 +++--- gpu-benchmarks/allreduce/run_frontier.sh | 6 +++--- gpu-benchmarks/gemm/run_frontier.sh | 6 +++--- 7 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 gpu-benchmarks/README.md diff --git a/AMG2023/README.md b/AMG2023/README.md index 03832f1..14c75c8 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -60,7 +60,7 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm/6.2.4 + module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version @@ -76,9 +76,10 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) - Configure hypre (in hypre/src) ```sh ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ - --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ - --with-MPI-include="${MPICH_DIR}/include" 
\ - --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" + --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse" ``` - Compile hypre (in hypre/src) ```sh @@ -91,11 +92,17 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) git clone https://github.com/pssg-int/AMG2023` cd AMG2023 ``` + - Add mpiP to LD_LIBRARY_PATH + ```sh + export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH + ``` - Configure cmake ```sh mkdir build && cd build - cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" ``` - Compile AMG2023 (in AMG2023/build) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 92664c3..d635c31 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -34,11 +34,13 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index eb4c6d9..8854ca1 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -34,12 +34,14 @@ module reset echo loading modules: module load cray-mpich/8.1.30 
module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md new file mode 100644 index 0000000..c8f9c25 --- /dev/null +++ b/gpu-benchmarks/README.md @@ -0,0 +1,14 @@ +# gpu-benchmarks README +Code Repository: [gpu-benchmarks](#TODO:) + +## Perlmutter Compilation + +### Steps to Compile + +TODO: + +## Frontier Compilation + +### Steps to Compile + +TODO: \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index cb98dd6..75216e8 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 5ac70ea..729c539 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load 
PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 4ffd5e8..d089dd1 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,11 +21,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x NUM_TASKS=$(($NUM_NODES * 8)) From 56e0fd5a4b4f47b807fe6db75cfeb1ab4c5476a9 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:56:25 -0500 Subject: [PATCH 6/7] use gpu-bind=none for frontier --- AMG2023/run_frontier_16.sh | 1 + AMG2023/run_frontier_64.sh | 2 +- gpu-benchmarks/allgather/run_frontier.sh | 3 ++- gpu-benchmarks/allreduce/run_frontier.sh | 3 ++- gpu-benchmarks/gemm/run_frontier.sh | 3 ++- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index d635c31..c0a69b0 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -3,6 +3,7 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output 
/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8854ca1..8baabe8 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -3,6 +3,7 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log @@ -39,7 +40,6 @@ module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so # mpiP export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 75216e8..79cedc7 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x @@ -50,4 +51,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log echo $CMD $CMD echo end allgather: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 729c539..56bd2fe 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks 
EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x @@ -45,4 +46,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log echo $CMD $CMD echo end allreduce: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index d089dd1..9ccecbd 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -24,6 +24,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x @@ -43,4 +44,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log echo $CMD $CMD echo end gemm: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE From a087255bc2a22e152a5508f3103c88bfc7847ebc Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:28:22 -0500 Subject: [PATCH 7/7] update gpu-benchmarks to specify ROCM version --- AMG2023/run_frontier_16.sh | 7 ++++--- AMG2023/run_frontier_64.sh | 7 ++++--- gpu-benchmarks/allgather/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/allreduce/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/gemm/run_frontier.sh | 25 ++++++++++++++++-------- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index c0a69b0..c51b52d 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE 
$ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8baabe8..c7a7a3e 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 79cedc7..7fc10b4 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ 
b/gpu-benchmarks/allgather/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 " +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 56bd2fe..855a486 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 " +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + 
MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 9ccecbd..c5348be 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -4,14 +4,23 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh #!/bin/bash -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " exit 1 fi +# `5.7.1` or `6.1.3` +ROCM_VERSION=$1 # `16` or `64` -NUM_NODES=$1 +NUM_NODES=$2 # output directory -OUTPUT_DIR=$2 +OUTPUT_DIR=$3 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log @@ -21,13 +30,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - 
EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) export MPICH_GPU_SUPPORT_ENABLED=1