From ca42dbd33517c93b924609833ce4d92fc32b7ad2 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:46:33 -0500 Subject: [PATCH 1/9] add frontier install instrs for AMG2023 --- AMG2023/README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/AMG2023/README.md b/AMG2023/README.md index 476ad56..71af9f6 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -51,4 +51,47 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) ``` ## Frontier Installation +1. Load modules + ```sh + module reset + + module load cray-mpich/8.1.28 + module load craype-accel-amd-gfx90a + module load rocm + export MPICH_GPU_SUPPORT_ENABLED=1 + + # load compatible cmake version + module load Core/24.07 + module load cmake/3.27.9 + ``` +2. Configure hypre + - Clone hypre v2.27.0 and navigate to src: + ```sh + git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + cd into ~/hypre/src + ``` + - Configure hypre (in hypre/src) + ```sh + ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ``` + - Compile hypre (in hypre/src) + ```sh + # build with make + make + ``` +3. Configure AMG2023 + - Clone repo: + ```sh + git clone https://github.com/pssg-int/AMG2023` + cd AMG2023 + ``` + - Configure cmake + ```sh + mkdir build && cd build + cmake .. 
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` From 28abeca38d9f9396e89d5ef2091720a371f5f4f8 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:06:04 -0500 Subject: [PATCH 2/9] frontier scripts for AMG and for gpu benchmarks --- AMG2023/run_frontier_16.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_64.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_crontab.sh | 19 ++++++++ gpu-benchmarks/allgather/run_frontier.sh | 51 +++++++++++++++++++++ gpu-benchmarks/allreduce/run_frontier.sh | 46 +++++++++++++++++++ gpu-benchmarks/gemm/run_frontier.sh | 44 +++++++++++++++++++ 6 files changed, 272 insertions(+) create mode 100644 AMG2023/run_frontier_16.sh create mode 100644 AMG2023/run_frontier_64.sh create mode 100644 AMG2023/run_frontier_crontab.sh create mode 100644 gpu-benchmarks/allgather/run_frontier.sh create mode 100644 gpu-benchmarks/allreduce/run_frontier.sh create mode 100644 gpu-benchmarks/gemm/run_frontier.sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..8546887 --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability 
+echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c28de6a --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh new file mode 100644 index 0000000..dfd7bfe --- /dev/null +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -0,0 +1,51 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allgather: $(date) + # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun. + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --gpu-bind none \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allgather: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh new file mode 100644 index 0000000..caafc1a --- /dev/null +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -0,0 +1,46 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..6f9bb5b --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,44 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh + +#!/bin/bash +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 +# output directory +OUTPUT_DIR=$2 + +OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + NUM_TASKS=$(($NUM_NODES * 8)) + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start gemm: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --output $OUTPUT_FILE \ + $EXEC" + echo running: + echo $CMD + $CMD + echo end gemm: $(date) +} >> $OUTPUT_FILE From d59c821dd5b11603f99e984df12cdb7cf00f8c24 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:18:39 -0500 Subject: [PATCH 3/9] reformat readme --- AMG2023/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 71af9f6..3e9b90e 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -1,9 +1,9 @@ # AMG2023 README For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf). -## Perlmutter Compilation +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -Repository: [AMG2023](https://github.com/pssg-int/AMG2023) +## Perlmutter Compilation ### Steps to Compile @@ -50,7 +50,10 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 .. ``` -## Frontier Installation +## Frontier Compilation + +### Steps to Compile + 1. 
Load modules ```sh module reset From c76505da4eeba3de36130bf3345d88cd4236aaad Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:05:56 -0500 Subject: [PATCH 4/9] update AMG2023 and gpu-benchmarks scripts to use newest rocm and cray-mpich versions available on frontier --- AMG2023/README.md | 15 +++++++++------ AMG2023/run_frontier_16.sh | 11 ++++------- AMG2023/run_frontier_64.sh | 10 ++++------ gpu-benchmarks/allgather/run_frontier.sh | 4 +++- gpu-benchmarks/allreduce/run_frontier.sh | 4 +++- gpu-benchmarks/gemm/run_frontier.sh | 4 +++- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 3e9b90e..03832f1 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -58,24 +58,27 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) ```sh module reset - module load cray-mpich/8.1.28 + module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm + module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version module load Core/24.07 module load cmake/3.27.9 ``` -2. Configure hypre - - Clone hypre v2.27.0 and navigate to src: +2. 
Configure hypre (v2.32.0) + - Clone hypre v2.32.0 and navigate to src: ```sh - git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + git clone -b v2.32.0 https://github.com/hypre-space/hypre.git cd into ~/hypre/src ``` - Configure hypre (in hypre/src) ```sh - ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ + --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" ``` - Compile hypre (in hypre/src) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 8546887..92664c3 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -32,22 +32,19 @@ echo resetting modules: module reset # load modules echo loading modules: -module load cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index c28de6a..eb4c6d9 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -32,22 +32,20 @@ echo resetting modules: module reset # load modules echo loading modules: -module load 
cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index dfd7bfe..cb98dd6 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index caafc1a..5ac70ea 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x 
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 6f9bb5b..4ffd5e8 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,7 +21,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x From 3d75c0d02d51fa80cf1888a84311c8cb3e5c2a3d Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 23:54:08 -0500 Subject: [PATCH 5/9] nanogpt scripts --- nanoGPT/README.md | 73 ++++++++++++++------- nanoGPT/run_frontier16.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier64.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier_crontab.sh | 19 ++++++ nanoGPT/train_gpt_neox_20B_frontier.py | 46 +++++++++++++ nanoGPT/train_gpt_neox_5B_frontier.py | 46 +++++++++++++ 6 files changed, 342 insertions(+), 22 deletions(-) create mode 100644 nanoGPT/run_frontier16.sh create mode 100644 nanoGPT/run_frontier64.sh create mode 100644 nanoGPT/run_frontier_crontab.sh create mode 100644 nanoGPT/train_gpt_neox_20B_frontier.py create mode 100644 nanoGPT/train_gpt_neox_5B_frontier.py diff --git a/nanoGPT/README.md b/nanoGPT/README.md index 5c499fc..87e8189 100644 --- a/nanoGPT/README.md +++ b/nanoGPT/README.md @@ -1,33 +1,62 @@ -# nanoGPT Setup Instructions +# nanoGPT README +For more detailed installation parameters, please refer to [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT). 
-## Clone the Repository +Repository: [nanoGPT](https://github.com/axonn-ai/nanoGPT) -```sh -git clone https://github.com/axonn-ai/nanoGPT.git -``` -## Create Python Environment +## Perlmutter Setup -```sh -./scripts/create_python_env_perlmutter.sh -``` +### Setup steps -> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` -## Load PyTorch Module +2. Create Python Environment + ```sh + ./scripts/create_python_env_perlmutter.sh + ``` + > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. -```sh -module load pytorch/2.0.1 -``` +3. Load PyTorch Module + ```sh + module load pytorch/2.0.1 + ``` -## Activate the Environment +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` -```sh -source path_to_nanogptENV/bin/activate -``` +5. Download Data + ```sh + python nanoGPT/data/openwebtext/prepare.py + ``` -## Download Data +## Frontier Setup -```sh -python nanoGPT/data/openwebtext/prepare.py -``` \ No newline at end of file +### Setup steps + +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` + +2. Create Python Environment + ```sh + ./scripts/create_python_env_frontier.sh + ``` + > Note: You may need to modify the WRKSPC path and torch version in `create_python_env_frontier.sh`. + +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` + +5. 
Download Data + ```sh + python data/openwebtext/prepare.py + ``` \ No newline at end of file diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh new file mode 100644 index 0000000..63718c5 --- /dev/null +++ b/nanoGPT/run_frontier16.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier16.sh + +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source 
${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + +SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh new file mode 100644 index 0000000..1c9a75b --- /dev/null +++ b/nanoGPT/run_frontier64.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier64.sh + +export 
JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + 
+SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py new file mode 100644 index 0000000..5fcc430 --- /dev/null +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 32 +block_size = 512 +gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus + +# model +n_layer = 24 +n_head = 32 +n_embd = 4096 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file From 7e8749e901dfdec0a55ad4d6b15a10816cc837a6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 04:45:42 -0500 Subject: [PATCH 6/9] updated AMG2023 and gpu-benchmarks run scripts --- AMG2023/README.md | 17 ++++++++++++----- AMG2023/run_frontier_16.sh | 4 +++- AMG2023/run_frontier_64.sh | 4 +++- gpu-benchmarks/README.md | 14 ++++++++++++++ gpu-benchmarks/allgather/run_frontier.sh | 6 +++--- gpu-benchmarks/allreduce/run_frontier.sh | 6 +++--- gpu-benchmarks/gemm/run_frontier.sh | 6 +++--- 7 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 gpu-benchmarks/README.md diff --git a/AMG2023/README.md b/AMG2023/README.md index 03832f1..14c75c8 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -60,7 +60,7 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm/6.2.4 + module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version @@ -76,9 +76,10 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) - Configure hypre (in hypre/src) ```sh ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ - --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ - --with-MPI-include="${MPICH_DIR}/include" \ - --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" + 
--with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse" ``` - Compile hypre (in hypre/src) ```sh @@ -91,11 +92,17 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) git clone https://github.com/pssg-int/AMG2023` cd AMG2023 ``` + - Add mpiP to LD_LIBRARY_PATH + ```sh + export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH + ``` - Configure cmake ```sh mkdir build && cd build - cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" ``` - Compile AMG2023 (in AMG2023/build) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 92664c3..d635c31 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -34,11 +34,13 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index eb4c6d9..8854ca1 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -34,12 +34,14 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export 
MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md new file mode 100644 index 0000000..c8f9c25 --- /dev/null +++ b/gpu-benchmarks/README.md @@ -0,0 +1,14 @@ +# gpu-benchmarks README +Code Repository: [gpu-benchmarks](#TODO:) + +## Perlmutter Compilation + +### Steps to Compile + +TODO: + +## Frontier Compilation + +### Steps to Compile + +TODO: \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index cb98dd6..75216e8 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 5ac70ea..729c539 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module 
load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 4ffd5e8..d089dd1 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,11 +21,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x NUM_TASKS=$(($NUM_NODES * 8)) From 56e0fd5a4b4f47b807fe6db75cfeb1ab4c5476a9 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:56:25 -0500 Subject: [PATCH 7/9] use gpu-bind=none for frontier --- AMG2023/run_frontier_16.sh | 1 + AMG2023/run_frontier_64.sh | 2 +- gpu-benchmarks/allgather/run_frontier.sh | 3 ++- gpu-benchmarks/allreduce/run_frontier.sh | 3 ++- gpu-benchmarks/gemm/run_frontier.sh | 3 ++- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index d635c31..c0a69b0 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -3,6 +3,7 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 
8854ca1..8baabe8 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -3,6 +3,7 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log @@ -39,7 +40,6 @@ module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so # mpiP export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 75216e8..79cedc7 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x @@ -50,4 +51,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log echo $CMD $CMD echo end allgather: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 729c539..56bd2fe 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x @@ -45,4 +46,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log echo $CMD $CMD echo end allreduce: $(date) -} >> 
$OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index d089dd1..9ccecbd 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -24,6 +24,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x @@ -43,4 +44,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log echo $CMD $CMD echo end gemm: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE From a087255bc2a22e152a5508f3103c88bfc7847ebc Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:28:22 -0500 Subject: [PATCH 8/9] update gpu-benchmarks to specify ROCM version --- AMG2023/run_frontier_16.sh | 7 ++++--- AMG2023/run_frontier_64.sh | 7 ++++--- gpu-benchmarks/allgather/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/allreduce/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/gemm/run_frontier.sh | 25 ++++++++++++++++-------- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index c0a69b0..c51b52d 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE 
$SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8baabe8..c7a7a3e 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 79cedc7..7fc10b4 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo 
"Usage: $0 " +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 56bd2fe..855a486 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 " +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module 
load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 9ccecbd..c5348be 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -4,14 +4,23 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh #!/bin/bash -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " exit 1 fi +# `5.7.1` or `6.1.3` +ROCM_VERSION=$1 # `16` or `64` -NUM_NODES=$1 +NUM_NODES=$2 # output directory -OUTPUT_DIR=$2 +OUTPUT_DIR=$3 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log @@ -21,13 +30,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) export MPICH_GPU_SUPPORT_ENABLED=1 From a68f2a16f3b0a1cded21ddf8b7cafd46c293527a 
Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:36:47 -0500 Subject: [PATCH 9/9] updated nanogpt scripts and reduced batch size due to HIP OOM errors --- nanoGPT/run_frontier16.sh | 84 +++++++++++++-------------- nanoGPT/run_frontier64.sh | 84 +++++++++++++-------------- nanoGPT/train_gpt_neox_5B_frontier.py | 4 +- 3 files changed, 82 insertions(+), 90 deletions(-) diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh index 63718c5..901561e 100644 --- a/nanoGPT/run_frontier16.sh +++ b/nanoGPT/run_frontier16.sh @@ -3,30 +3,19 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier16.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" 
@@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 
+OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh index 1c9a75b..3201b51 100644 --- a/nanoGPT/run_frontier64.sh +++ b/nanoGPT/run_frontier64.sh @@ -3,30 +3,19 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier64.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID 
OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" @@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export 
TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION 
$SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py index 5fcc430..4ce7b55 100644 --- a/nanoGPT/train_gpt_neox_5B_frontier.py +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -8,9 +8,9 @@ # these make the total batch size be ~0.5M # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 -batch_size = 32 +batch_size = 16 block_size = 512 -gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus +gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus # model n_layer = 24