From ca42dbd33517c93b924609833ce4d92fc32b7ad2 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:46:33 -0500 Subject: [PATCH 01/13] add frontier install instrs for AMG2023 --- AMG2023/README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/AMG2023/README.md b/AMG2023/README.md index 476ad56..71af9f6 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -51,4 +51,47 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) ``` ## Frontier Installation +1. Load modules + ```sh + module reset + + module load cray-mpich/8.1.28 + module load craype-accel-amd-gfx90a + module load rocm + export MPICH_GPU_SUPPORT_ENABLED=1 + + # load compatible cmake version + module load Core/24.07 + module load cmake/3.27.9 + ``` +2. Configure hypre + - Clone hypre v2.27.0 and navigate to src: + ```sh + git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + cd into ~/hypre/src + ``` + - Configure hypre (in hypre/src) + ```sh + ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ``` + - Compile hypre (in hypre/src) + ```sh + # build with make + make + ``` +3. Configure AMG2023 + - Clone repo: + ```sh + git clone https://github.com/pssg-int/AMG2023` + cd AMG2023 + ``` + - Configure cmake + ```sh + mkdir build && cd build + cmake .. 
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` From 28abeca38d9f9396e89d5ef2091720a371f5f4f8 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:06:04 -0500 Subject: [PATCH 02/13] frontier scripts for AMG and for gpu benchmarks --- AMG2023/run_frontier_16.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_64.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_crontab.sh | 19 ++++++++ gpu-benchmarks/allgather/run_frontier.sh | 51 +++++++++++++++++++++ gpu-benchmarks/allreduce/run_frontier.sh | 46 +++++++++++++++++++ gpu-benchmarks/gemm/run_frontier.sh | 44 +++++++++++++++++++ 6 files changed, 272 insertions(+) create mode 100644 AMG2023/run_frontier_16.sh create mode 100644 AMG2023/run_frontier_64.sh create mode 100644 AMG2023/run_frontier_crontab.sh create mode 100644 gpu-benchmarks/allgather/run_frontier.sh create mode 100644 gpu-benchmarks/allreduce/run_frontier.sh create mode 100644 gpu-benchmarks/gemm/run_frontier.sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..8546887 --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c28de6a --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log 
+ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh new file mode 100644 index 0000000..dfd7bfe --- /dev/null +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -0,0 +1,51 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh
+
+#!/bin/bash
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 <mpi|rccl> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `16` or `64`
+NUM_NODES=$2
+# output directory
+OUTPUT_DIR=$3
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm
+
+  GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+  MIN_MSG_SIZE=$((1 * 1024))
+  MAX_MSG_SIZE=$((1 * 1024 * 1024))
+  ITERATIONS=100
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start allgather: $(date)
+  # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun.
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --gpus-per-node 8 \
+    --gpus-per-task 1 \
+    --ntasks-per-node 8 \
+    --gpu-bind none \
+    --output $OUTPUT_FILE \
+    $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end allgather: $(date)
+} >> $OUTPUT_FILE
diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh
new file mode 100644
index 0000000..caafc1a
--- /dev/null
+++ b/gpu-benchmarks/allreduce/run_frontier.sh
@@ -0,0 +1,46 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..6f9bb5b --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,44 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh + +#!/bin/bash +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 +# output directory +OUTPUT_DIR=$2 + +OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + NUM_TASKS=$(($NUM_NODES * 8)) + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start gemm: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --output $OUTPUT_FILE \ + $EXEC" + echo running: + echo $CMD + $CMD + echo end gemm: $(date) +} >> $OUTPUT_FILE From d59c821dd5b11603f99e984df12cdb7cf00f8c24 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:18:39 -0500 Subject: [PATCH 03/13] reformat readme --- AMG2023/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 71af9f6..3e9b90e 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -1,9 +1,9 @@ # AMG2023 README For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf). -## Perlmutter Compilation +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -Repository: [AMG2023](https://github.com/pssg-int/AMG2023) +## Perlmutter Compilation ### Steps to Compile @@ -50,7 +50,10 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 .. ``` -## Frontier Installation +## Frontier Compilation + +### Steps to Compile + 1. 
Load modules ```sh module reset From c76505da4eeba3de36130bf3345d88cd4236aaad Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:05:56 -0500 Subject: [PATCH 04/13] update AMG2023 and gpu-benchmarks scripts to use newest rocm and cray-mpich versions available on frontier --- AMG2023/README.md | 15 +++++++++------ AMG2023/run_frontier_16.sh | 11 ++++------- AMG2023/run_frontier_64.sh | 10 ++++------ gpu-benchmarks/allgather/run_frontier.sh | 4 +++- gpu-benchmarks/allreduce/run_frontier.sh | 4 +++- gpu-benchmarks/gemm/run_frontier.sh | 4 +++- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 3e9b90e..03832f1 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -58,24 +58,27 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) ```sh module reset - module load cray-mpich/8.1.28 + module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm + module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version module load Core/24.07 module load cmake/3.27.9 ``` -2. Configure hypre - - Clone hypre v2.27.0 and navigate to src: +2. 
Configure hypre (v2.32.0) + - Clone hypre v2.32.0 and navigate to src: ```sh - git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + git clone -b v2.32.0 https://github.com/hypre-space/hypre.git cd into ~/hypre/src ``` - Configure hypre (in hypre/src) ```sh - ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ + --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" ``` - Compile hypre (in hypre/src) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 8546887..92664c3 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -32,22 +32,19 @@ echo resetting modules: module reset # load modules echo loading modules: -module load cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index c28de6a..eb4c6d9 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -32,22 +32,20 @@ echo resetting modules: module reset # load modules echo loading modules: -module load 
cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index dfd7bfe..cb98dd6 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index caafc1a..5ac70ea 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x 
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 6f9bb5b..4ffd5e8 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,7 +21,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x From 3d75c0d02d51fa80cf1888a84311c8cb3e5c2a3d Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 23:54:08 -0500 Subject: [PATCH 05/13] nanogpt scripts --- nanoGPT/README.md | 73 ++++++++++++++------- nanoGPT/run_frontier16.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier64.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier_crontab.sh | 19 ++++++ nanoGPT/train_gpt_neox_20B_frontier.py | 46 +++++++++++++ nanoGPT/train_gpt_neox_5B_frontier.py | 46 +++++++++++++ 6 files changed, 342 insertions(+), 22 deletions(-) create mode 100644 nanoGPT/run_frontier16.sh create mode 100644 nanoGPT/run_frontier64.sh create mode 100644 nanoGPT/run_frontier_crontab.sh create mode 100644 nanoGPT/train_gpt_neox_20B_frontier.py create mode 100644 nanoGPT/train_gpt_neox_5B_frontier.py diff --git a/nanoGPT/README.md b/nanoGPT/README.md index 5c499fc..87e8189 100644 --- a/nanoGPT/README.md +++ b/nanoGPT/README.md @@ -1,33 +1,62 @@ -# nanoGPT Setup Instructions +# nanoGPT README +For more detailed installation parameters, please refer to [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT). 
-## Clone the Repository +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -```sh -git clone https://github.com/axonn-ai/nanoGPT.git -``` -## Create Python Environment +## Perlmutter Setup -```sh -./scripts/create_python_env_perlmutter.sh -``` +### Setup steps -> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` -## Load PyTorch Module +2. Create Python Environment + ```sh + ./scripts/create_python_env_perlmutter.sh + ``` + > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. -```sh -module load pytorch/2.0.1 -``` +3. Load PyTorch Module + ```sh + module load pytorch/2.0.1 + ``` -## Activate the Environment +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` -```sh -source path_to_nanogptENV/bin/activate -``` +5. Download Data + ```sh + python nanoGPT/data/openwebtext/prepare.py + ``` -## Download Data +## Frontier Setup -```sh -python nanoGPT/data/openwebtext/prepare.py -``` \ No newline at end of file +### Setup steps + +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` + +2. Create Python Environment + ```sh + ./scripts/create_python_env_frontier.sh + ``` + > Note: You may need to modify the WKSPC path and torch version in `create_python_env_frontier.sh`. + +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` + +5. 
Download Data + ```sh + python data/openwebtext/prepare.py + ``` \ No newline at end of file diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh new file mode 100644 index 0000000..63718c5 --- /dev/null +++ b/nanoGPT/run_frontier16.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier16.sh + +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source 
${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + +SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh new file mode 100644 index 0000000..1c9a75b --- /dev/null +++ b/nanoGPT/run_frontier64.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier64.sh + +export 
JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + 
+SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py new file mode 100644 index 0000000..5fcc430 --- /dev/null +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 32 +block_size = 512 +gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus + +# model +n_layer = 24 +n_head = 32 +n_embd = 4096 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file From 7e8749e901dfdec0a55ad4d6b15a10816cc837a6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 04:45:42 -0500 Subject: [PATCH 06/13] updated AMG2023 and gpu-benchmarks run scripts --- AMG2023/README.md | 17 ++++++++++++----- AMG2023/run_frontier_16.sh | 4 +++- AMG2023/run_frontier_64.sh | 4 +++- gpu-benchmarks/README.md | 14 ++++++++++++++ gpu-benchmarks/allgather/run_frontier.sh | 6 +++--- gpu-benchmarks/allreduce/run_frontier.sh | 6 +++--- gpu-benchmarks/gemm/run_frontier.sh | 6 +++--- 7 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 gpu-benchmarks/README.md diff --git a/AMG2023/README.md b/AMG2023/README.md index 03832f1..14c75c8 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -60,7 +60,7 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm/6.2.4 + module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version @@ -76,9 +76,10 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) - Configure hypre (in hypre/src) ```sh ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ - --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ - --with-MPI-include="${MPICH_DIR}/include" \ - --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" + 
--with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse" ``` - Compile hypre (in hypre/src) ```sh @@ -91,11 +92,17 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) git clone https://github.com/pssg-int/AMG2023` cd AMG2023 ``` + - Add mpiP to LD_LIBRARY_PATH + ```sh + export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH + ``` - Configure cmake ```sh mkdir build && cd build - cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" ``` - Compile AMG2023 (in AMG2023/build) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 92664c3..d635c31 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -34,11 +34,13 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index eb4c6d9..8854ca1 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -34,12 +34,14 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export 
MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md new file mode 100644 index 0000000..c8f9c25 --- /dev/null +++ b/gpu-benchmarks/README.md @@ -0,0 +1,14 @@ +# gpu-benchmarks README +Code Repository: [gpu-benchmarks](#TODO:) + +## Perlmutter Compilation + +### Steps to Compile + +TODO: + +## Frontier Compilation + +### Steps to Compile + +TODO: \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index cb98dd6..75216e8 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 5ac70ea..729c539 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module 
load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 4ffd5e8..d089dd1 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,11 +21,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x NUM_TASKS=$(($NUM_NODES * 8)) From 6e394a280e740dbf3781ef6e822a411cb40a6912 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 16:06:48 -0500 Subject: [PATCH 07/13] DeepCAM scripts and crontab for frontier --- DeepCAM/README.md | 131 +++++++++++++++++++++++++++++++ DeepCAM/run_frontier_16.sh | 132 ++++++++++++++++++++++++++++++++ DeepCAM/run_frontier_64.sh | 132 ++++++++++++++++++++++++++++++++ DeepCAM/run_frontier_crontab.sh | 19 +++++ 4 files changed, 414 insertions(+) create mode 100644 DeepCAM/README.md create mode 100644 DeepCAM/run_frontier_16.sh create mode 100644 DeepCAM/run_frontier_64.sh create mode 100644 DeepCAM/run_frontier_crontab.sh diff --git a/DeepCAM/README.md b/DeepCAM/README.md new file mode 100644 index 0000000..94e6880 --- /dev/null +++ b/DeepCAM/README.md @@ -0,0 +1,131 @@ +# DeepCAM README +For more detailed installation parameters, please refer to DeepCAM install guide + +Perlmutter Repository: 
[hpc_results_v3.0](https://github.com/hpcgroup/hpc_results_v3.0) +Frontier Repository: [hpc](https://github.com/hpcgroup/hpc) + + +## Perlmutter Setup + +### Setup steps + +## Frontier Setup + +### Setup steps + +#### 1. Pytorch Install +- Load modules + ```bash + module reset + module load PrgEnv-gnu/8.5.0 + module load rocm/6.1.3 + module load craype-accel-amd-gfx90a + module load cray-python/3.9.13.1 +- Create env variables + ```bash + DEEPCAM_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam/ + PYVENV_ROOT=${DEEPCAM_ROOT}/.venv + PYVENV_SITEPKGS=${PYVENV_ROOT}/lib/python3.9/site-packages + + cd ${DEEPCAM_ROOT} + ``` +- Create python virtual env + ```bash + python -m venv ${PYVENV_ROOT} + source ${PYVENV_ROOT}/bin/activate + ``` +- Install torch and mpi4py + ```bash + # torch==2.5.0 + pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.1 + + MPICC="cc -shared" pip install --no-cache-dir --no-binary=mpi4py mpi4py + ``` +- Install AWS-OCI-RCCL plugin + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + rocm_version=6.1.3 + # Load modules + module load PrgEnv-gnu/8.5.0 + module load rocm/$rocm_version + module load craype-accel-amd-gfx90a + module load gcc-native/12.3 + module load cray-mpich/8.1.30 + #module load libtool + libfabric_path=/opt/cray/libfabric/1.15.2.0 + + # Download the plugin repo + git clone --recursive https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl + cd aws-ofi-rccl + + # Build the plugin + ./autogen.sh + export LD_LIBRARY_PATH=/opt/rocm-$rocm_version/hip/lib:$LD_LIBRARY_PATH + PLUG_PREFIX=$PWD + + CC=hipcc CFLAGS=-I/opt/rocm-$rocm_version/rccl/include ./configure \ + --with-libfabric=$libfabric_path --with-rccl=/opt/rocm-$rocm_version --enable-trace \ + --prefix=$PLUG_PREFIX --with-hip=/opt/rocm-$rocm_version/hip --with-mpi=$MPICH_DIR + + make + make install + + # Reminder to export the plugin to your path + echo $PLUG_PREFIX + echo "Add the following line 
in the environment to use the AWS OFI RCCL plugin" + echo "export LD_LIBRARY_PATH="$PLUG_PREFIX"/lib:$""LD_LIBRARY_PATH" + ``` +- Install supporting dependencies + ```bash + cd ${DEEPCAM_ROOT} + + pip install wandb + pip install gym + pip install pyspark + pip install scikit-learn + pip install scikit-image + pip install opencv-python + pip install wheel + pip install tomli + pip install h5py + + # tensorboard + pip install tensorboard + pip install tensorboard_plugin_profile + pip install tensorboard-plugin-wit + pip install tensorboard-pytorch + + pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git + ``` +- Install mlperf-logging + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + git clone -b hpc-1.0-branch https://github.com/mlcommons/logging mlperf-logging + # may need to manually change mlperf-logging/VERSION to a valid version number (e.g. 1.0.0.rc2) + pip install -e mlperf-logging + + rm ${PYVENV_SITEPKGS}/mlperf-logging.egg-link + cp -r ./mlperf-logging/mlperf_logging ${PYVENV_SITEPKGS}/mlperf_logging + cp -r ./mlperf-logging/mlperf_logging.egg-info ${PYVENV_SITEPKGS}/mlperf_logging.egg-info + ``` + +#### 2. Download src code +- Download from PSSG Frontier repo for DeepCAM (linked at top of README) + ```bash + # REPLACE WITH YOUR PATH + PRFX=/lustre/orion/csc569/scratch/keshprad + DEEPCAM_ROOT=${PRFX}/deepcam + + mkdir -p ${DEEPCAM_ROOT} + cd ${DEEPCAM_ROOT} + + git clone https://github.com/hpcgroup/hpc.git hpc + ``` + +#### 3. 
Download dataset with globus +- [Globus Link](https://app.globus.org/file-manager?origin_id=0b226e2c-4de0-11ea-971a-021304b0cca7&origin_path=%2F) + - Download to `$DEEPCAM_ROOT/data` \ No newline at end of file diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh new file mode 100644 index 0000000..14cef72 --- /dev/null +++ b/DeepCAM/run_frontier_16.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH -t 01:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load ums/default +module load ums002/default +module load 
cray-hdf5-parallel/1.12.2.1 + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +mkdir -p ${JOB_OUTPUT_PATH} +export OMP_NUM_THREADS=1 +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export NCCL_SOCKET_IFNAME=hsn0 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +# Add AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH + +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" + +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. 
\ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" + +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=2 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run without profiler +export WITH_PROFILER=0 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE + + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh new file mode 100644 index 0000000..48e7059 --- /dev/null +++ b/DeepCAM/run_frontier_64.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH -t 01:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# 
Run like: sbatch run_frontier_64.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load ums/default +module load ums002/default +module load cray-hdf5-parallel/1.12.2.1 + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +mkdir -p ${JOB_OUTPUT_PATH} +export OMP_NUM_THREADS=1 +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export NCCL_SOCKET_IFNAME=hsn0 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +# Add AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH + +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed 
$(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" + +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. \ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" + +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=8 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run without profiler +export WITH_PROFILER=0 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE + + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_crontab.sh b/DeepCAM/run_frontier_crontab.sh new file mode 100644 index 0000000..6d70161 --- /dev/null +++ b/DeepCAM/run_frontier_crontab.sh @@ -0,0 +1,19 @@ 
+#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 <num_nodes>" + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/DeepCAM/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file From 56e0fd5a4b4f47b807fe6db75cfeb1ab4c5476a9 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:56:25 -0500 Subject: [PATCH 08/13] use gpu-bind=none for frontier --- AMG2023/run_frontier_16.sh | 1 + AMG2023/run_frontier_64.sh | 2 +- gpu-benchmarks/allgather/run_frontier.sh | 3 ++- gpu-benchmarks/allreduce/run_frontier.sh | 3 ++- gpu-benchmarks/gemm/run_frontier.sh | 3 ++- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index d635c31..c0a69b0 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -3,6 +3,7 @@ 
#SBATCH -n 128 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8854ca1..8baabe8 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -3,6 +3,7 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log @@ -39,7 +40,6 @@ module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so # mpiP export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 75216e8..79cedc7 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x @@ -50,4 +51,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log echo $CMD $CMD echo end allgather: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 729c539..56bd2fe 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load 
cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x @@ -45,4 +46,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log echo $CMD $CMD echo end allreduce: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index d089dd1..9ccecbd 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -24,6 +24,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x @@ -43,4 +44,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log echo $CMD $CMD echo end gemm: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE From 175ee55cbf8adfb0ce633236d55dbb80e1064b7a Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:59:15 -0500 Subject: [PATCH 09/13] use gpu-bind=none for deepcam on frontier --- DeepCAM/run_frontier_16.sh | 7 +++---- DeepCAM/run_frontier_64.sh | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh index 14cef72..e87f6e9 100644 --- a/DeepCAM/run_frontier_16.sh +++ b/DeepCAM/run_frontier_16.sh @@ -3,7 +3,8 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J deepcam -#SBATCH -t 01:30:00 +#SBATCH --gpu-bind none +#SBATCH -t 01:00:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log @@ -38,9 +39,7 @@ module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 module load craype-accel-amd-gfx90a module load 
cray-python/3.9.13.1 -module load ums/default -module load ums002/default -module load cray-hdf5-parallel/1.12.2.1 +module list # activate virtual env echo activating virtual env: diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh index 48e7059..fbe0b29 100644 --- a/DeepCAM/run_frontier_64.sh +++ b/DeepCAM/run_frontier_64.sh @@ -3,7 +3,8 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J deepcam -#SBATCH -t 01:30:00 +#SBATCH --gpu-bind none +#SBATCH -t 01:00:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log @@ -38,9 +39,7 @@ module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load ums/default -module load ums002/default -module load cray-hdf5-parallel/1.12.2.1 +module list # activate virtual env echo activating virtual env: From a087255bc2a22e152a5508f3103c88bfc7847ebc Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:28:22 -0500 Subject: [PATCH 10/13] update gpu-benchmarks to specify ROCM version --- AMG2023/run_frontier_16.sh | 7 ++++--- AMG2023/run_frontier_64.sh | 7 ++++--- gpu-benchmarks/allgather/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/allreduce/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/gemm/run_frontier.sh | 25 ++++++++++++++++-------- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index c0a69b0..c51b52d 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash 
$PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8baabe8..c7a7a3e 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git 
a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 79cedc7..7fc10b4 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 <comm_type> <num_nodes> <output_dir>" +if [ "$#" -ne 4 ]; then + echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>" exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 56bd2fe..855a486 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 <comm_type> <num_nodes> <output_dir>" +if [ "$#" -ne 4 ]; then + echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>" exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 
+NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 9ccecbd..c5348be 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -4,14 +4,23 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh #!/bin/bash -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " exit 1 fi +# `5.7.1` or `6.1.3` +ROCM_VERSION=$1 # `16` or `64` -NUM_NODES=$1 +NUM_NODES=$2 # output directory -OUTPUT_DIR=$2 +OUTPUT_DIR=$3 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log @@ -21,13 +30,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load 
cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) export MPICH_GPU_SUPPORT_ENABLED=1 From a68f2a16f3b0a1cded21ddf8b7cafd46c293527a Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:36:47 -0500 Subject: [PATCH 11/13] updated nanogpt scripts and reduced batch size due to HIP OOM errors --- nanoGPT/run_frontier16.sh | 84 +++++++++++++-------------- nanoGPT/run_frontier64.sh | 84 +++++++++++++-------------- nanoGPT/train_gpt_neox_5B_frontier.py | 4 +- 3 files changed, 82 insertions(+), 90 deletions(-) diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh index 63718c5..901561e 100644 --- a/nanoGPT/run_frontier16.sh +++ b/nanoGPT/run_frontier16.sh @@ -3,30 +3,19 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier16.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash 
$PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" @@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py 
config/train_gpt_neox_5B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh index 1c9a75b..3201b51 100644 --- a/nanoGPT/run_frontier64.sh +++ b/nanoGPT/run_frontier64.sh @@ -3,30 +3,19 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 
00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier64.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" @@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export 
MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py index 5fcc430..4ce7b55 100644 --- a/nanoGPT/train_gpt_neox_5B_frontier.py +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -8,9 +8,9 @@ # these make the total batch size be ~0.5M # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 -batch_size = 32 +batch_size = 16 block_size = 512 -gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus +gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus # model n_layer = 24 From 2d2689ca7190fe975e040f0de85d5e3431b5740d Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:39:39 -0500 Subject: [PATCH 12/13] updated deepcam scripts to resolve nccl issues --- DeepCAM/run_frontier_16.sh | 79 +++++++++++++++++++------------------- DeepCAM/run_frontier_64.sh | 79 +++++++++++++++++++------------------- 2 files changed, 80 insertions(+), 78 deletions(-) diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh index e87f6e9..593608a 100644 --- a/DeepCAM/run_frontier_16.sh +++ b/DeepCAM/run_frontier_16.sh @@ -4,7 +4,7 @@ #SBATCH -q normal #SBATCH -J deepcam #SBATCH --gpu-bind none -#SBATCH -t 01:00:00 +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log #SBATCH --error 
/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log @@ -16,24 +16,15 @@ export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_log OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam cd $APP_WORKING_DIR -# reset modules +# load modules +ROCM_VERSION=6.1.3 echo resetting modules: module reset -# load modules echo loading modules: module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 @@ -47,20 +38,36 @@ source ${APP_ROOT}/.venv/bin/activate # ENV variables echo setting env vars: -mkdir -p ${JOB_OUTPUT_PATH} -export OMP_NUM_THREADS=1 -export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export NCCL_SOCKET_IFNAME=hsn0 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 # Needed to bypass MIOpen, Disk I/O Errors export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -# Add AWS-OFI-RCCL +## some RCCL env variables +export FI_CXI_ATS=0 +export 
HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" BENCH_RCP_FIXED="\ --gradient_accumulation_frequency 1 \ --logging_frequency 10 \ @@ -68,7 +75,6 @@ BENCH_RCP_FIXED="\ --seed $(date +%s) \ --batchnorm_group_size 1 \ --target_iou 0.80" - #BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. #It should not be modified. BENCH_RCP_BASELINE_LR="\ @@ -78,13 +84,12 @@ BENCH_RCP_BASELINE_LR="\ --lr_warmup_factor 1. \ --weight_decay 1e-2 \ --optimizer_betas 0.9 0.999" - BENCH_RCP_BASELINE="\ ${BENCH_RCP_FIXED} \ ${BENCH_RCP_BASELINE_LR}" # define command -MAX_EPOCHS=2 +MAX_EPOCHS=1 cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --gpu-bind=closest --gpus-per-task=1 \ --cpu-bind=none --hint=nomultithread \ @@ -98,21 +103,6 @@ cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --optimizer "Adam" \ --local_batch_size 2" -# run without profiler -export WITH_PROFILER=0 -OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" -# clear cache -rm -rf ${MIOPEN_USER_DB_PATH} -mkdir -p ${MIOPEN_USER_DB_PATH} -# log start date -echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE -# execute command -echo $cmd &>> $OUTPUT_FILE -eval $cmd &>> $OUTPUT_FILE -# log end date -echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE - - # run with profiler export WITH_PROFILER=1 OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" @@ -128,4 +118,15 @@ eval $cmd &>> $OUTPUT_FILE echo "end deepcam: $(date)" &>> $OUTPUT_FILE rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running 
allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh index fbe0b29..5c406fe 100644 --- a/DeepCAM/run_frontier_64.sh +++ b/DeepCAM/run_frontier_64.sh @@ -4,7 +4,7 @@ #SBATCH -q normal #SBATCH -J deepcam #SBATCH --gpu-bind none -#SBATCH -t 01:00:00 +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log @@ -16,24 +16,15 @@ export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_log OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam cd $APP_WORKING_DIR -# 
reset modules +# load modules +ROCM_VERSION=6.1.3 echo resetting modules: module reset -# load modules echo loading modules: module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 @@ -47,20 +38,36 @@ source ${APP_ROOT}/.venv/bin/activate # ENV variables echo setting env vars: -mkdir -p ${JOB_OUTPUT_PATH} -export OMP_NUM_THREADS=1 -export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export NCCL_SOCKET_IFNAME=hsn0 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 # Needed to bypass MIOpen, Disk I/O Errors export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -# Add AWS-OFI-RCCL +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" BENCH_RCP_FIXED="\ --gradient_accumulation_frequency 1 \ --logging_frequency 10 \ @@ -68,7 +75,6 @@ BENCH_RCP_FIXED="\ --seed $(date +%s) \ --batchnorm_group_size 1 \ --target_iou 0.80" - #BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. #It should not be modified. BENCH_RCP_BASELINE_LR="\ @@ -78,13 +84,12 @@ BENCH_RCP_BASELINE_LR="\ --lr_warmup_factor 1. 
\ --weight_decay 1e-2 \ --optimizer_betas 0.9 0.999" - BENCH_RCP_BASELINE="\ ${BENCH_RCP_FIXED} \ ${BENCH_RCP_BASELINE_LR}" # define command -MAX_EPOCHS=8 +MAX_EPOCHS=4 cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --gpu-bind=closest --gpus-per-task=1 \ --cpu-bind=none --hint=nomultithread \ @@ -98,21 +103,6 @@ cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --optimizer "Adam" \ --local_batch_size 2" -# run without profiler -export WITH_PROFILER=0 -OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" -# clear cache -rm -rf ${MIOPEN_USER_DB_PATH} -mkdir -p ${MIOPEN_USER_DB_PATH} -# log start date -echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE -# execute command -echo $cmd &>> $OUTPUT_FILE -eval $cmd &>> $OUTPUT_FILE -# log end date -echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE - - # run with profiler export WITH_PROFILER=1 OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" @@ -128,4 +118,15 @@ eval $cmd &>> $OUTPUT_FILE echo "end deepcam: $(date)" &>> $OUTPUT_FILE rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + echo "end run: $(date)" \ No newline at end of file From 94b59495c2bf67fd1c06d7f7341b1b7edce3f0f6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:41:38 -0500 Subject: [PATCH 13/13] milc scripts and params --- MILC/params_frontier.40.16 | 60 ++++ MILC/params_frontier.40.64 | 
60 ++++ MILC/rat.m013m065m838 | 554 +++++++++++++++++++++++++++++++++++ MILC/run_frontier_40.16.sh | 76 +++++ MILC/run_frontier_40.64.sh | 76 +++++ MILC/run_frontier_crontab.sh | 19 ++ 6 files changed, 845 insertions(+) create mode 100644 MILC/params_frontier.40.16 create mode 100644 MILC/params_frontier.40.64 create mode 100644 MILC/rat.m013m065m838 create mode 100644 MILC/run_frontier_40.16.sh create mode 100644 MILC/run_frontier_40.64.sh create mode 100644 MILC/run_frontier_crontab.sh diff --git a/MILC/params_frontier.40.16 b/MILC/params_frontier.40.16 new file mode 100644 index 0000000..3593cab --- /dev/null +++ b/MILC/params_frontier.40.16 @@ -0,0 +1,60 @@ +prompt 0 +nx 80 +ny 160 +nz 160 +nt 160 +node_geometry 2 4 4 4 +ionode_geometry 2 4 4 4 +iseed 5682304 +n_pseudo 5 +load_rhmc_params rat.m013m065m838 +beta 5.60 +n_dyn_masses 3 +dyn_mass 0.013 0.065 0.838 +dyn_flavors 2 1 1 +u0 0.85535 + +warms 0 +trajecs 2 +traj_between_meas 2 +microcanonical_time_step 0.2 +steps_per_trajectory 2 +cgresid_md_fa_gr .000000025 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .000000005 .000000002 .000000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +prec_ff 1 + +number_of_pbp_masses 3 +max_cg_prop 1750 +max_cg_prop_restarts 5 +npbp_reps 1 +prec_pbp 2 +mass 0.013 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.065 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.838 +naik_term_epsilon -0.358197 +error_for_propagator 2e-8 +rel_error_for_propagator 0 + +fresh +#reload_serial l1216b560m013m065m838.test +forget + diff --git 
a/MILC/params_frontier.40.64 b/MILC/params_frontier.40.64 new file mode 100644 index 0000000..73826ed --- /dev/null +++ b/MILC/params_frontier.40.64 @@ -0,0 +1,60 @@ +prompt 0 +nx 80 +ny 160 +nz 320 +nt 320 +node_geometry 2 4 8 8 +ionode_geometry 2 4 8 8 +iseed 5682304 +n_pseudo 5 +load_rhmc_params rat.m013m065m838 +beta 5.60 +n_dyn_masses 3 +dyn_mass 0.013 0.065 0.838 +dyn_flavors 2 1 1 +u0 0.85535 + +warms 0 +trajecs 2 +traj_between_meas 2 +microcanonical_time_step 0.2 +steps_per_trajectory 2 +cgresid_md_fa_gr .000000025 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .000000005 .000000002 .000000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +prec_ff 1 + +number_of_pbp_masses 3 +max_cg_prop 1750 +max_cg_prop_restarts 5 +npbp_reps 1 +prec_pbp 2 +mass 0.013 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.065 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.838 +naik_term_epsilon -0.358197 +error_for_propagator 2e-8 +rel_error_for_propagator 0 + +fresh +#reload_serial l1216b560m013m065m838.test +forget + diff --git a/MILC/rat.m013m065m838 b/MILC/rat.m013m065m838 new file mode 100644 index 0000000..af0212f --- /dev/null +++ b/MILC/rat.m013m065m838 @@ -0,0 +1,554 @@ +n_pseudo 5 + +naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.013000^2)^(2/4) (x+4*0.065000^2)^(1/4) (x+4*0.200000^2)^(-3/4) (x+4*99.900000^2)^(0/4) +# Converged at 1215 iterations, 
error = 4.464654e-10 + + +# Rational function for MD +y_MD -2 -1 3 0 +z_MD 4 4 4 4 +m_MD 0.013000 0.065000 0.200000 99.900000 +order_MD 9 + +res_MD 1.0000000005312786e+00 +res_MD 5.1478424553065552e-03 +res_MD 6.1255750425527403e-03 +res_MD 8.2831942811878966e-03 +res_MD 1.2118242740786475e-02 +res_MD 1.8952479058480964e-02 +res_MD 2.9418638688841297e-02 +res_MD 1.8470115784153345e-02 +res_MD 1.2658218218545392e-02 +res_MD 4.2626859206241910e-03 + +pole_MD 99.9 +pole_MD 7.0774135543816004e-04 +pole_MD 9.9827758499373954e-04 +pole_MD 1.8047757109780971e-03 +pole_MD 3.7411067368715600e-03 +pole_MD 8.1989400404790334e-03 +pole_MD 1.6961565896872453e-02 +pole_MD 3.6947931361687537e-02 +pole_MD 7.5128029017199366e-02 +pole_MD 1.2749566440898313e-01 + +# CHECK: f(1.000000e-15) = 2.698640e+01 = 2.698640e+01? + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (11,11) +# Approximating the function (x+4*0.013000^2)^(2/8) (x+4*0.065000^2)^(1/8) (x+4*0.200000^2)^(-3/8) (x+4*99.900000^2)^(0/8) +# Converged at 1458 iterations, error = 3.338720e-12 + + +# Rational function for GR +y_GR 2 1 -3 0 +z_GR 8 8 8 8 +m_GR 0.013000 0.065000 0.200000 99.900000 +order_GR 11 + +res_GR 9.9999999999569866e-01 +res_GR -4.0060866788845987e-06 +res_GR -1.5999218783896372e-05 +res_GR -4.4655743629733473e-05 +res_GR -1.1314117169439806e-04 +res_GR -2.7282105670893862e-04 +res_GR -6.1067990531200617e-04 +res_GR -1.9139700349766983e-03 +res_GR -5.0187173668868809e-03 +res_GR -1.0796690978882167e-02 +res_GR -1.8360260474439800e-02 +res_GR -2.0567557871129686e-02 + +pole_GR 99.9 +pole_GR 7.4665670046294302e-04 +pole_GR 1.0337590207549056e-03 +pole_GR 1.6932805716009024e-03 +pole_GR 3.0741725059849415e-03 +pole_GR 5.8941171906434549e-03 +pole_GR 1.1579298991070606e-02 +pole_GR 2.3594965292107566e-02 +pole_GR 4.3836643614606971e-02 +pole_GR 7.6692462791476457e-02 +pole_GR 1.1910887316297561e-01 +pole_GR 
1.5357310472237296e-01 + +# CHECK: f(1.000000e-15) = 1.924986e-01 = 1.924986e-01? + + +# Rational function for FA +y_FA -2 -1 3 0 +z_FA 8 8 8 8 +m_FA 0.013000 0.065000 0.200000 99.900000 +order_FA 11 + +res_FA 1.0000000000043012e+00 +res_FA 2.7624276919541819e-04 +res_FA 5.2622338238058285e-04 +res_FA 8.8065939393702753e-04 +res_FA 1.4738849748348461e-03 +res_FA 2.5034718455081732e-03 +res_FA 4.4044075660614667e-03 +res_FA 9.0907008499393795e-03 +res_FA 1.1167054802824348e-02 +res_FA 1.2487688569309182e-02 +res_FA 1.0529250690731072e-02 +res_FA 4.3789150648981231e-03 + +pole_FA 99.9 +pole_FA 7.1219307391404054e-04 +pole_FA 9.3554136557657695e-04 +pole_FA 1.4789436727802529e-03 +pole_FA 2.6307764047071667e-03 +pole_FA 4.9930192831341728e-03 +pole_FA 9.7679165430922116e-03 +pole_FA 1.8753770710693469e-02 +pole_FA 3.4890062285776381e-02 +pole_FA 6.2841673100822434e-02 +pole_FA 1.0283414918156956e-01 +pole_FA 1.4327078903261969e-01 + +# CHECK: f(1.000000e-15) = 5.194844e+00 = 5.194844e+00? +naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.200000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 327 iterations, error = 2.398230e-07 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.200000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.4922969612472456e-01 +res_MD 4.6061009721530329e-02 +res_MD 1.1379997711196768e-01 +res_MD 2.7453631915038118e-01 +res_MD 6.8761853150948216e-01 +res_MD 1.8320055253741194e+00 +res_MD 5.8748098933529755e+00 +res_MD 3.8086202810075271e+01 + +pole_MD 99.9 +pole_MD 1.8528257058192529e-01 +pole_MD 3.7539911712311358e-01 +pole_MD 1.0581221519179564e+00 +pole_MD 3.4031251236671252e+00 +pole_MD 1.1740502300224540e+01 +pole_MD 4.5730041178720789e+01 +pole_MD 2.8391564624137345e+02 + +# CHECK: f(1.000000e-15) = 
1.581138e+00 = 1.581139e+00? + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.200000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 422 iterations, error = 1.700228e-09 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.200000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 2.7327733614742757e+00 +res_GR -5.1288622160737063e-03 +res_GR -2.0478919078421993e-02 +res_GR -6.3951352353102550e-02 +res_GR -1.9268600508174474e-01 +res_GR -5.8519459892515535e-01 +res_GR -1.8580289566259904e+00 +res_GR -6.6894626972932318e+00 +res_GR -3.4401695696851874e+01 +res_GR -6.1789841402475429e+02 + +pole_GR 99.9 +pole_GR 1.8642524371106700e-01 +pole_GR 3.1582609770026626e-01 +pole_GR 6.7910597147834129e-01 +pole_GR 1.6420879353554103e+00 +pole_GR 4.2025399779967945e+00 +pole_GR 1.1215270946596673e+01 +pole_GR 3.2072436285328692e+01 +pole_GR 1.1032630752536383e+02 +pole_GR 7.6441549844451458e+02 + +# CHECK: f(1.000000e-15) = 7.952707e-01 = 7.952707e-01? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.200000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.6592862551196720e-01 +res_FA 1.0931558584388677e-02 +res_FA 2.9290340023195943e-02 +res_FA 6.7875566115597738e-02 +res_FA 1.5709264288335881e-01 +res_FA 3.7002733866738813e-01 +res_FA 9.0645109653452494e-01 +res_FA 2.4523352073991669e+00 +res_FA 8.7150280660454786e+00 +res_FA 7.5897099555179366e+01 + +pole_FA 99.9 +pole_FA 1.7887536305510454e-01 +pole_FA 2.9094384593655243e-01 +pole_FA 6.1203693854712271e-01 +pole_FA 1.4648617324427462e+00 +pole_FA 3.7284495585739217e+00 +pole_FA 9.8932956134892081e+00 +pole_FA 2.7949316233288556e+01 +pole_FA 9.2734993617230558e+01 +pole_FA 5.4606225005034105e+02 + +# CHECK: f(1.000000e-15) = 1.257433e+00 = 1.257433e+00? 
+naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.200000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 327 iterations, error = 2.398230e-07 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.200000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.4922969612472456e-01 +res_MD 4.6061009721530329e-02 +res_MD 1.1379997711196768e-01 +res_MD 2.7453631915038118e-01 +res_MD 6.8761853150948216e-01 +res_MD 1.8320055253741194e+00 +res_MD 5.8748098933529755e+00 +res_MD 3.8086202810075271e+01 + +pole_MD 99.9 +pole_MD 1.8528257058192529e-01 +pole_MD 3.7539911712311358e-01 +pole_MD 1.0581221519179564e+00 +pole_MD 3.4031251236671252e+00 +pole_MD 1.1740502300224540e+01 +pole_MD 4.5730041178720789e+01 +pole_MD 2.8391564624137345e+02 + +# CHECK: f(1.000000e-15) = 1.581138e+00 = 1.581139e+00? 
+ + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.200000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 422 iterations, error = 1.700228e-09 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.200000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 2.7327733614742757e+00 +res_GR -5.1288622160737063e-03 +res_GR -2.0478919078421993e-02 +res_GR -6.3951352353102550e-02 +res_GR -1.9268600508174474e-01 +res_GR -5.8519459892515535e-01 +res_GR -1.8580289566259904e+00 +res_GR -6.6894626972932318e+00 +res_GR -3.4401695696851874e+01 +res_GR -6.1789841402475429e+02 + +pole_GR 99.9 +pole_GR 1.8642524371106700e-01 +pole_GR 3.1582609770026626e-01 +pole_GR 6.7910597147834129e-01 +pole_GR 1.6420879353554103e+00 +pole_GR 4.2025399779967945e+00 +pole_GR 1.1215270946596673e+01 +pole_GR 3.2072436285328692e+01 +pole_GR 1.1032630752536383e+02 +pole_GR 7.6441549844451458e+02 + +# CHECK: f(1.000000e-15) = 7.952707e-01 = 7.952707e-01? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.200000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.6592862551196720e-01 +res_FA 1.0931558584388677e-02 +res_FA 2.9290340023195943e-02 +res_FA 6.7875566115597738e-02 +res_FA 1.5709264288335881e-01 +res_FA 3.7002733866738813e-01 +res_FA 9.0645109653452494e-01 +res_FA 2.4523352073991669e+00 +res_FA 8.7150280660454786e+00 +res_FA 7.5897099555179366e+01 + +pole_FA 99.9 +pole_FA 1.7887536305510454e-01 +pole_FA 2.9094384593655243e-01 +pole_FA 6.1203693854712271e-01 +pole_FA 1.4648617324427462e+00 +pole_FA 3.7284495585739217e+00 +pole_FA 9.8932956134892081e+00 +pole_FA 2.7949316233288556e+01 +pole_FA 9.2734993617230558e+01 +pole_FA 5.4606225005034105e+02 + +# CHECK: f(1.000000e-15) = 1.257433e+00 = 1.257433e+00? 
+naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.200000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 327 iterations, error = 2.398230e-07 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.200000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.4922969612472456e-01 +res_MD 4.6061009721530329e-02 +res_MD 1.1379997711196768e-01 +res_MD 2.7453631915038118e-01 +res_MD 6.8761853150948216e-01 +res_MD 1.8320055253741194e+00 +res_MD 5.8748098933529755e+00 +res_MD 3.8086202810075271e+01 + +pole_MD 99.9 +pole_MD 1.8528257058192529e-01 +pole_MD 3.7539911712311358e-01 +pole_MD 1.0581221519179564e+00 +pole_MD 3.4031251236671252e+00 +pole_MD 1.1740502300224540e+01 +pole_MD 4.5730041178720789e+01 +pole_MD 2.8391564624137345e+02 + +# CHECK: f(1.000000e-15) = 1.581138e+00 = 1.581139e+00? 
+ + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.200000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 422 iterations, error = 1.700228e-09 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.200000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 2.7327733614742757e+00 +res_GR -5.1288622160737063e-03 +res_GR -2.0478919078421993e-02 +res_GR -6.3951352353102550e-02 +res_GR -1.9268600508174474e-01 +res_GR -5.8519459892515535e-01 +res_GR -1.8580289566259904e+00 +res_GR -6.6894626972932318e+00 +res_GR -3.4401695696851874e+01 +res_GR -6.1789841402475429e+02 + +pole_GR 99.9 +pole_GR 1.8642524371106700e-01 +pole_GR 3.1582609770026626e-01 +pole_GR 6.7910597147834129e-01 +pole_GR 1.6420879353554103e+00 +pole_GR 4.2025399779967945e+00 +pole_GR 1.1215270946596673e+01 +pole_GR 3.2072436285328692e+01 +pole_GR 1.1032630752536383e+02 +pole_GR 7.6441549844451458e+02 + +# CHECK: f(1.000000e-15) = 7.952707e-01 = 7.952707e-01? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.200000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.6592862551196720e-01 +res_FA 1.0931558584388677e-02 +res_FA 2.9290340023195943e-02 +res_FA 6.7875566115597738e-02 +res_FA 1.5709264288335881e-01 +res_FA 3.7002733866738813e-01 +res_FA 9.0645109653452494e-01 +res_FA 2.4523352073991669e+00 +res_FA 8.7150280660454786e+00 +res_FA 7.5897099555179366e+01 + +pole_FA 99.9 +pole_FA 1.7887536305510454e-01 +pole_FA 2.9094384593655243e-01 +pole_FA 6.1203693854712271e-01 +pole_FA 1.4648617324427462e+00 +pole_FA 3.7284495585739217e+00 +pole_FA 9.8932956134892081e+00 +pole_FA 2.7949316233288556e+01 +pole_FA 9.2734993617230558e+01 +pole_FA 5.4606225005034105e+02 + +# CHECK: f(1.000000e-15) = 1.257433e+00 = 1.257433e+00? 
+naik_term_epsilon -0.358197 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.838000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 254 iterations, error = 1.451256e-10 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.838000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.2322471525321263e-01 +res_MD 2.1771921408584441e-01 +res_MD 4.4206637506002772e-01 +res_MD 8.2618794992683819e-01 +res_MD 1.6297128076156222e+00 +res_MD 3.6167435981445921e+00 +res_MD 1.0509605478166112e+01 +res_MD 6.7097093571965161e+01 + +pole_MD 99.9 +pole_MD 3.0160974627586552e+00 +pole_MD 4.3550360604645304e+00 +pole_MD 7.9796160098146069e+00 +pole_MD 1.7084127350226733e+01 +pole_MD 4.1574237400987570e+01 +pole_MD 1.2322623145255724e+02 +pole_MD 6.4326627115402380e+02 + +# CHECK: f(1.000000e-15) = 7.724369e-01 = 7.724369e-01? 
+ + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.838000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 328 iterations, error = 1.426221e-13 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.838000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 3.0073112563922351e+00 +res_GR -5.3234413512341650e-02 +res_GR -1.7673165828433515e-01 +res_GR -4.2726410063882991e-01 +res_GR -9.7790664416295114e-01 +res_GR -2.2882846047810173e+00 +res_GR -5.8208997626948111e+00 +res_GR -1.7844293878766898e+01 +res_GR -8.4032413065091191e+01 +res_GR -1.4669209010508403e+03 + +pole_GR 99.9 +pole_GR 3.0252410237704845e+00 +pole_GR 3.9660872619800918e+00 +pole_GR 6.1039445287657701e+00 +pole_GR 1.0530577643916541e+01 +pole_GR 1.9759567382759283e+01 +pole_GR 4.0165388592627771e+01 +pole_GR 9.1668640228708554e+01 +pole_GR 2.6672834021452144e+02 +pole_GR 1.6783103311924824e+03 + +# CHECK: f(1.000000e-15) = 1.137807e+00 = 1.137807e+00? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.838000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.3252294649396041e-01 +res_FA 6.7957577334402247e-02 +res_FA 1.5690077204608199e-01 +res_FA 2.9390758191833499e-01 +res_FA 5.4247602885208523e-01 +res_FA 1.0357646568428991e+00 +res_FA 2.1330614641295562e+00 +res_FA 5.1181689865260589e+00 +res_FA 1.7166623006718723e+01 +res_FA 1.4802690454646185e+02 + +pole_FA 99.9 +pole_FA 2.9645701362629495e+00 +pole_FA 3.7967708400810514e+00 +pole_FA 5.7427951679138749e+00 +pole_FA 9.7876489526599268e+00 +pole_FA 1.8188863360964667e+01 +pole_FA 3.6571166564954488e+01 +pole_FA 8.1929059816469248e+01 +pole_FA 2.2810985650690961e+02 +pole_FA 1.2082659732685304e+03 + +# CHECK: f(1.000000e-15) = 8.788839e-01 = 8.788839e-01? 
diff --git a/MILC/run_frontier_40.16.sh b/MILC/run_frontier_40.16.sh new file mode 100644 index 0000000..0f76532 --- /dev/null +++ b/MILC/run_frontier_40.16.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=milc_40.16 +#SBATCH --account=csc569 +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=16 +#SBATCH --ntasks=128 +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH -t 00:30:00 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/16nodes/%x-%j/job-error.log +# RUN LIKE: sbatch run_frontier_40.16.sh + +echo "start run: $(date)" + +ORION_SCRATCH=/lustre/orion/csc569/scratch/keshprad +OUTPUT_DIR=$ORION_SCRATCH/perfvar/MILC_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +MILC_OUTPUT_FILE=$OUTPUT_DIR/output-MILC.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +# define paths variables +BENCH_TOPDIR=/ccs/home/keshprad/MILC/OLCF-6_MILC_benchmark +MILC_QCD_DIR=${BENCH_TOPDIR}/build/milc_qcd +exe=${MILC_QCD_DIR}/ks_imp_rhmc/su3_rhmd_hisq +input=$PERF_VARIABILITY_ROOT/MILC/params_frontier.40.16 +# Load modules, setup environment +source ${BENCH_TOPDIR}/build/env.sh + +# Define environment variables +# mpich +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_NEMESIS_ASYNC_PROGRESS=1 +# quda +export QUDA_ENABLE_GDR=1 +export QUDA_ENABLE_P2P=1 +export 
QUDA_MILC_HISQ_RECONSTRUCT=13 +export QUDA_MILC_HISQ_RECONSTRUCT_SLOPPY=9 +# omp +export OMP_NUM_THREADS=7 +export OMP_PROC_BIND="spread, spread, spread" +export SLURM_CPU_BIND="cores" + +# qudatune +# Tuning results are stored in qudatune_dir. +export QUDA_RESOURCE_PATH="$ORION_SCRATCH/perfvar/MILC/qudatune_40.16" +if [ ! -d ${QUDA_RESOURCE_PATH} ]; then + mkdir -p ${QUDA_RESOURCE_PATH} +fi + +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}:$LD_LIBRARY_PATH +export MPIP="-o -f $OUTPUT_DIR" +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}/libmpiP.so + +# run milc +cd $PERF_VARIABILITY_ROOT/MILC/ +command="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + -n $SLURM_NTASKS -c 7 \ + $exe $input $MILC_OUTPUT_FILE" +echo running milc +echo $command &>> $MILC_OUTPUT_FILE +eval $command &>> $MILC_OUTPUT_FILE + +echo end run: $(date) \ No newline at end of file diff --git a/MILC/run_frontier_40.64.sh b/MILC/run_frontier_40.64.sh new file mode 100644 index 0000000..3bbbebe --- /dev/null +++ b/MILC/run_frontier_40.64.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=milc_40.64 +#SBATCH --account=csc569 +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=64 +#SBATCH --ntasks=512 +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH -t 00:30:00 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/64nodes/%x-%j/job-error.log +# RUN LIKE: sbatch run_frontier_40.64.sh + +echo "start run: $(date)" + +ORION_SCRATCH=/lustre/orion/csc569/scratch/keshprad +OUTPUT_DIR=$ORION_SCRATCH/perfvar/MILC_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +MILC_OUTPUT_FILE=$OUTPUT_DIR/output-MILC.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash 
$PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +# define paths variables +BENCH_TOPDIR=/ccs/home/keshprad/MILC/OLCF-6_MILC_benchmark +MILC_QCD_DIR=${BENCH_TOPDIR}/build/milc_qcd +exe=${MILC_QCD_DIR}/ks_imp_rhmc/su3_rhmd_hisq +input=$PERF_VARIABILITY_ROOT/MILC/params_frontier.40.64 +# Load modules, setup environment +source ${BENCH_TOPDIR}/build/env.sh + +# Define environment variables +# mpich +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_NEMESIS_ASYNC_PROGRESS=1 +# quda +export QUDA_ENABLE_GDR=1 +export QUDA_ENABLE_P2P=1 +export QUDA_MILC_HISQ_RECONSTRUCT=13 +export QUDA_MILC_HISQ_RECONSTRUCT_SLOPPY=9 +# omp +export OMP_NUM_THREADS=7 +export OMP_PROC_BIND="spread, spread, spread" +export SLURM_CPU_BIND="cores" + +# qudatune +# Tuning results are stored in qudatune_dir. +export QUDA_RESOURCE_PATH="$ORION_SCRATCH/perfvar/MILC/qudatune_40.64" +if [ ! 
-d ${QUDA_RESOURCE_PATH} ]; then
+  mkdir -p ${QUDA_RESOURCE_PATH}
+fi
+
+# mpiP
+export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}:$LD_LIBRARY_PATH
+export MPIP="-o -f $OUTPUT_DIR"
+export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}/libmpiP.so
+
+# run milc
+cd $PERF_VARIABILITY_ROOT/MILC/
+command="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \
+    -n $SLURM_NTASKS -c 7 \
+    $exe $input $MILC_OUTPUT_FILE"
+echo running milc
+echo $command &>> $MILC_OUTPUT_FILE
+eval $command &>> $MILC_OUTPUT_FILE
+
+echo end run: $(date)
\ No newline at end of file
diff --git a/MILC/run_frontier_crontab.sh b/MILC/run_frontier_crontab.sh
new file mode 100644
index 0000000..8aa0ef3
--- /dev/null
+++ b/MILC/run_frontier_crontab.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <num_nodes>"
+    exit 1
+fi
+# `16` or `64`
+NUM_NODES=$1
+
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+
+# load lmod
+source /usr/share/lmod/lmod/init/bash
+# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH
+export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps
+export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles
+
+# run sbatch script 
+script=$PERF_VARIABILITY_ROOT/MILC/run_frontier_40.$NUM_NODES\.sh +sbatch $script \ No newline at end of file