From ca42dbd33517c93b924609833ce4d92fc32b7ad2 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:46:33 -0500 Subject: [PATCH 01/13] add frontier install instrs for AMG2023 --- AMG2023/README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/AMG2023/README.md b/AMG2023/README.md index 476ad56..71af9f6 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -51,4 +51,47 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) ``` ## Frontier Installation +1. Load modules + ```sh + module reset + + module load cray-mpich/8.1.28 + module load craype-accel-amd-gfx90a + module load rocm + export MPICH_GPU_SUPPORT_ENABLED=1 + + # load compatible cmake version + module load Core/24.07 + module load cmake/3.27.9 + ``` +2. Configure hypre + - Clone hypre v2.27.0 and navigate to src: + ```sh + git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + cd into ~/hypre/src + ``` + - Configure hypre (in hypre/src) + ```sh + ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ``` + - Compile hypre (in hypre/src) + ```sh + # build with make + make + ``` +3. Configure AMG2023 + - Clone repo: + ```sh + git clone https://github.com/pssg-int/AMG2023` + cd AMG2023 + ``` + - Configure cmake + ```sh + mkdir build && cd build + cmake .. 
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` From 28abeca38d9f9396e89d5ef2091720a371f5f4f8 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:06:04 -0500 Subject: [PATCH 02/13] frontier scripts for AMG and for gpu benchmarks --- AMG2023/run_frontier_16.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_64.sh | 56 ++++++++++++++++++++++++ AMG2023/run_frontier_crontab.sh | 19 ++++++++ gpu-benchmarks/allgather/run_frontier.sh | 51 +++++++++++++++++++++ gpu-benchmarks/allreduce/run_frontier.sh | 46 +++++++++++++++++++ gpu-benchmarks/gemm/run_frontier.sh | 44 +++++++++++++++++++ 6 files changed, 272 insertions(+) create mode 100644 AMG2023/run_frontier_16.sh create mode 100644 AMG2023/run_frontier_64.sh create mode 100644 AMG2023/run_frontier_crontab.sh create mode 100644 gpu-benchmarks/allgather/run_frontier.sh create mode 100644 gpu-benchmarks/allreduce/run_frontier.sh create mode 100644 gpu-benchmarks/gemm/run_frontier.sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..8546887 --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c28de6a --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log 
+ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.28 +module load craype-accel-amd-gfx90a +module load rocm + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +export MPIP="-f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + --output $OUTPUT_FILE \ + --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh new file mode 100644 index 0000000..dfd7bfe --- /dev/null +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -0,0 +1,51 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh
+
+#!/bin/bash
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 <mpi|rccl> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `16` or `64`
+NUM_NODES=$2
+# output directory
+OUTPUT_DIR=$3
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm
+
+  GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+  MIN_MSG_SIZE=$((1 * 1024))
+  MAX_MSG_SIZE=$((1 * 1024 * 1024))
+  ITERATIONS=100
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start allgather: $(date)
+  # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun.
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --gpus-per-node 8 \
+    --gpus-per-task 1 \
+    --ntasks-per-node 8 \
+    --gpu-bind none \
+    --output $OUTPUT_FILE \
+    $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end allgather: $(date)
+} >> $OUTPUT_FILE
diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh
new file mode 100644
index 0000000..caafc1a
--- /dev/null
+++ b/gpu-benchmarks/allreduce/run_frontier.sh
@@ -0,0 +1,46 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `16` or `64` +NUM_NODES=$2 +# output directory +OUTPUT_DIR=$3 + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} >> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..6f9bb5b --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,44 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+ +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh + +#!/bin/bash +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 +# output directory +OUTPUT_DIR=$2 + +OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + + GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + NUM_TASKS=$(($NUM_NODES * 8)) + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start gemm: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --output $OUTPUT_FILE \ + $EXEC" + echo running: + echo $CMD + $CMD + echo end gemm: $(date) +} >> $OUTPUT_FILE From d59c821dd5b11603f99e984df12cdb7cf00f8c24 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 18 Dec 2024 02:18:39 -0500 Subject: [PATCH 03/13] reformat readme --- AMG2023/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 71af9f6..3e9b90e 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -1,9 +1,9 @@ # AMG2023 README For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf). -## Perlmutter Compilation +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -Repository: [AMG2023](https://github.com/pssg-int/AMG2023) +## Perlmutter Compilation ### Steps to Compile @@ -50,7 +50,10 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023) cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 .. ``` -## Frontier Installation +## Frontier Compilation + +### Steps to Compile + 1. 
Load modules ```sh module reset From c76505da4eeba3de36130bf3345d88cd4236aaad Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 01:05:56 -0500 Subject: [PATCH 04/13] update AMG2023 and gpu-benchmarks scripts to use newest rocm and cray-mpich versions available on frontier --- AMG2023/README.md | 15 +++++++++------ AMG2023/run_frontier_16.sh | 11 ++++------- AMG2023/run_frontier_64.sh | 10 ++++------ gpu-benchmarks/allgather/run_frontier.sh | 4 +++- gpu-benchmarks/allreduce/run_frontier.sh | 4 +++- gpu-benchmarks/gemm/run_frontier.sh | 4 +++- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/AMG2023/README.md b/AMG2023/README.md index 3e9b90e..03832f1 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -58,24 +58,27 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) ```sh module reset - module load cray-mpich/8.1.28 + module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm + module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version module load Core/24.07 module load cmake/3.27.9 ``` -2. Configure hypre - - Clone hypre v2.27.0 and navigate to src: +2. 
Configure hypre (v2.32.0) + - Clone hypre v2.32.0 and navigate to src: ```sh - git clone -b v2.27.0 https://github.com/hypre-space/hypre.git + git clone -b v2.32.0 https://github.com/hypre-space/hypre.git cd into ~/hypre/src ``` - Configure hypre (in hypre/src) ```sh - ./configure --with-hip --with-gpu-arch=gfx90a --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" --with-MPI-include="${MPICH_DIR}/include" + ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ + --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" ``` - Compile hypre (in hypre/src) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 8546887..92664c3 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -32,22 +32,19 @@ echo resetting modules: module reset # load modules echo loading modules: -module load cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index c28de6a..eb4c6d9 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -32,22 +32,20 @@ echo resetting modules: module reset # load modules echo loading modules: -module load 
cray-mpich/8.1.28 +module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm +module load rocm/6.2.4 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so -export MPIP="-f $OUTPUT_DIR" +export MPIP="-o -f $OUTPUT_DIR" # log start date echo start AMG2023: $(date) # define command -cmd="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ - --output $OUTPUT_FILE \ - --error $ERROR_FILE \ +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" echo solving: echo $cmd diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index dfd7bfe..cb98dd6 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index caafc1a..5ac70ea 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,7 +23,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x 
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 6f9bb5b..4ffd5e8 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,7 +21,9 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 rocm + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load cray-mpich/8.1.30 + module load rocm/6.2.4 GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x From 3d75c0d02d51fa80cf1888a84311c8cb3e5c2a3d Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Wed, 25 Dec 2024 23:54:08 -0500 Subject: [PATCH 05/13] nanogpt scripts --- nanoGPT/README.md | 73 ++++++++++++++------- nanoGPT/run_frontier16.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier64.sh | 90 ++++++++++++++++++++++++++ nanoGPT/run_frontier_crontab.sh | 19 ++++++ nanoGPT/train_gpt_neox_20B_frontier.py | 46 +++++++++++++ nanoGPT/train_gpt_neox_5B_frontier.py | 46 +++++++++++++ 6 files changed, 342 insertions(+), 22 deletions(-) create mode 100644 nanoGPT/run_frontier16.sh create mode 100644 nanoGPT/run_frontier64.sh create mode 100644 nanoGPT/run_frontier_crontab.sh create mode 100644 nanoGPT/train_gpt_neox_20B_frontier.py create mode 100644 nanoGPT/train_gpt_neox_5B_frontier.py diff --git a/nanoGPT/README.md b/nanoGPT/README.md index 5c499fc..87e8189 100644 --- a/nanoGPT/README.md +++ b/nanoGPT/README.md @@ -1,33 +1,62 @@ -# nanoGPT Setup Instructions +# nanoGPT README +For more detailed installation parameters, please refer to [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT). 
-## Clone the Repository +Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) -```sh -git clone https://github.com/axonn-ai/nanoGPT.git -``` -## Create Python Environment +## Perlmutter Setup -```sh -./scripts/create_python_env_perlmutter.sh -``` +### Setup steps -> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` -## Load PyTorch Module +2. Create Python Environment + ```sh + ./scripts/create_python_env_perlmutter.sh + ``` + > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`. -```sh -module load pytorch/2.0.1 -``` +3. Load PyTorch Module + ```sh + module load pytorch/2.0.1 + ``` -## Activate the Environment +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` -```sh -source path_to_nanogptENV/bin/activate -``` +5. Download Data + ```sh + python nanoGPT/data/openwebtext/prepare.py + ``` -## Download Data +## Frontier Setup -```sh -python nanoGPT/data/openwebtext/prepare.py -``` \ No newline at end of file +### Setup steps + +1. Clone the Repository + ```sh + git clone https://github.com/axonn-ai/nanoGPT.git + cd nanoGPT + ``` + +2. Create Python Environment + ```sh + ./scripts/create_python_env_frontier.sh + ``` + > Note: You may need to modify the WKSPC path and torch version in `create_python_env_frontier.sh`. + +4. Activate the Environment + ```sh + source path_to_nanogptENV/bin/activate + ``` + +5. 
Download Data + ```sh + python data/openwebtext/prepare.py + ``` \ No newline at end of file diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh new file mode 100644 index 0000000..63718c5 --- /dev/null +++ b/nanoGPT/run_frontier16.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier16.sh + +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source 
${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + +SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh new file mode 100644 index 0000000..1c9a75b --- /dev/null +++ b/nanoGPT/run_frontier64.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH -t 01:00:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier64.sh + +export 
JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT +cd $APP_ROOT + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +rocm_version=6.1.3 +module reset +module load PrgEnv-gnu/8.5.0 +module load rocm/${rocm_version} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load gcc-native/12.3 +module load cray-mpich/8.1.30 +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export WORLD_SIZE=${GPUS} + +## nccl env vars to speedup stuff +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_NET_GDR_LEVEL=PHB +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export NCCL_NET="AWS Libfabric" +export NCCL_TIMEOUT=1200 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 +export MPICH_GPU_SUPPORT_ENABLED=0 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" + 
+SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" + +# run without profiler +export WITH_PROFILER=0 +# log start date +echo start nanoGPT_withoutprof: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT_withoutprof: $(date) + + +# run with profiler +export WITH_PROFILER=1 +# log start date +echo start nanoGPT: $(date) +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" +echo $run_cmd +eval $run_cmd +# log end date +echo end nanoGPT: $(date) diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py new file mode 100644 index 0000000..5fcc430 --- /dev/null +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 32 +block_size = 512 +gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus + +# model +n_layer = 24 +n_head = 32 +n_embd = 4096 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
+ +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file From 7e8749e901dfdec0a55ad4d6b15a10816cc837a6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 04:45:42 -0500 Subject: [PATCH 06/13] updated AMG2023 and gpu-benchmarks run scripts --- AMG2023/README.md | 17 ++++++++++++----- AMG2023/run_frontier_16.sh | 4 +++- AMG2023/run_frontier_64.sh | 4 +++- gpu-benchmarks/README.md | 14 ++++++++++++++ gpu-benchmarks/allgather/run_frontier.sh | 6 +++--- gpu-benchmarks/allreduce/run_frontier.sh | 6 +++--- gpu-benchmarks/gemm/run_frontier.sh | 6 +++--- 7 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 gpu-benchmarks/README.md diff --git a/AMG2023/README.md b/AMG2023/README.md index 03832f1..14c75c8 100644 --- a/AMG2023/README.md +++ b/AMG2023/README.md @@ -60,7 +60,7 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a - module load rocm/6.2.4 + module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 # load compatible cmake version @@ -76,9 +76,10 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) - Configure hypre (in hypre/src) ```sh ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \ - --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ - --with-MPI-include="${MPICH_DIR}/include" \ - --with-extra-CUFLAGS="-I/opt/rocm-6.2.4/include -I/opt/rocm-6.2.4/include/rocsparse -L/opt/rocm-6.2.4/lib" + 
--with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \ + --with-MPI-include="${MPICH_DIR}/include" \ + CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse" ``` - Compile hypre (in hypre/src) ```sh @@ -91,11 +92,17 @@ Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/) git clone https://github.com/pssg-int/AMG2023` cd AMG2023 ``` + - Add mpiP to LD_LIBRARY_PATH + ```sh + export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH + ``` - Configure cmake ```sh mkdir build && cd build - cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ -DCMAKE_EXE_LINKER_FLAGS="-lrocsparse -lrocrand" + cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" ``` - Compile AMG2023 (in AMG2023/build) ```sh diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index 92664c3..d635c31 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -34,11 +34,13 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index eb4c6d9..8854ca1 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -34,12 +34,14 @@ module reset echo loading modules: module load cray-mpich/8.1.30 module load craype-accel-amd-gfx90a -module load rocm/6.2.4 +module load rocm/6.1.3 export 
MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" # log start date diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md new file mode 100644 index 0000000..c8f9c25 --- /dev/null +++ b/gpu-benchmarks/README.md @@ -0,0 +1,14 @@ +# gpu-benchmarks README +Code Repository: [gpu-benchmarks](#TODO:) + +## Perlmutter Compilation + +### Steps to Compile + +TODO: + +## Frontier Compilation + +### Steps to Compile + +TODO: \ No newline at end of file diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index cb98dd6..75216e8 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 5ac70ea..729c539 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -23,11 +23,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module 
load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 4ffd5e8..d089dd1 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -21,11 +21,11 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.2.4 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 - module load rocm/6.2.4 + module load rocm/6.1.3 - GPU_BENCHMARKS_ROOT=/ccs/home/keshprad/gpu-benchmarks + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x NUM_TASKS=$(($NUM_NODES * 8)) From 6e394a280e740dbf3781ef6e822a411cb40a6912 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Fri, 27 Dec 2024 16:06:48 -0500 Subject: [PATCH 07/13] DeepCAM scripts and crontab for frontier --- DeepCAM/README.md | 131 +++++++++++++++++++++++++++++++ DeepCAM/run_frontier_16.sh | 132 ++++++++++++++++++++++++++++++++ DeepCAM/run_frontier_64.sh | 132 ++++++++++++++++++++++++++++++++ DeepCAM/run_frontier_crontab.sh | 19 +++++ 4 files changed, 414 insertions(+) create mode 100644 DeepCAM/README.md create mode 100644 DeepCAM/run_frontier_16.sh create mode 100644 DeepCAM/run_frontier_64.sh create mode 100644 DeepCAM/run_frontier_crontab.sh diff --git a/DeepCAM/README.md b/DeepCAM/README.md new file mode 100644 index 0000000..94e6880 --- /dev/null +++ b/DeepCAM/README.md @@ -0,0 +1,131 @@ +# DeepCAM README +For more detailed installation parameters, please refer to DeepCAM install guide + +Perlmutter Repository: 
[hpc_results_v3.0](https://github.com/hpcgroup/hpc_results_v3.0) +Frontier Repository: [hpc](https://github.com/hpcgroup/hpc) + + +## Perlmutter Setup + +### Setup steps + +## Frontier Setup + +### Setup steps + +#### 1. Pytorch Install +- Load modules + ```bash + module reset + module load PrgEnv-gnu/8.5.0 + module load rocm/6.1.3 + module load craype-accel-amd-gfx90a + module load cray-python/3.9.13.1 +- Create env variables + ```bash + DEEPCAM_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam/ + PYVENV_ROOT=${DEEPCAM_ROOT}/.venv + PYVENV_SITEPKGS=${PYVENV_ROOT}/lib/python3.9/site-packages + + cd ${DEEPCAM_ROOT} + ``` +- Create python virtual env + ```bash + python -m venv ${PYVENV_ROOT} + source ${PYVENV_ROOT}/bin/activate + ``` +- Install torch and mpi4py + ```bash + # torch==2.5.0 + pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.1 + + MPICC="cc -shared" pip install --no-cache-dir --no-binary=mpi4py mpi4py + ``` +- Install AWS-OCI-RCCL plugin + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + rocm_version=6.1.3 + # Load modules + module load PrgEnv-gnu/8.5.0 + module load rocm/$rocm_version + module load craype-accel-amd-gfx90a + module load gcc-native/12.3 + module load cray-mpich/8.1.30 + #module load libtool + libfabric_path=/opt/cray/libfabric/1.15.2.0 + + # Download the plugin repo + git clone --recursive https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl + cd aws-ofi-rccl + + # Build the plugin + ./autogen.sh + export LD_LIBRARY_PATH=/opt/rocm-$rocm_version/hip/lib:$LD_LIBRARY_PATH + PLUG_PREFIX=$PWD + + CC=hipcc CFLAGS=-I/opt/rocm-$rocm_version/rccl/include ./configure \ + --with-libfabric=$libfabric_path --with-rccl=/opt/rocm-$rocm_version --enable-trace \ + --prefix=$PLUG_PREFIX --with-hip=/opt/rocm-$rocm_version/hip --with-mpi=$MPICH_DIR + + make + make install + + # Reminder to export the plugin to your path + echo $PLUG_PREFIX + echo "Add the following line 
in the environment to use the AWS OFI RCCL plugin" + echo "export LD_LIBRARY_PATH="$PLUG_PREFIX"/lib:$""LD_LIBRARY_PATH" + ``` +- Install supporting dependencies + ```bash + cd ${DEEPCAM_ROOT} + + pip install wandb + pip install gym + pip install pyspark + pip install scikit-learn + pip install scikit-image + pip install opencv-python + pip install wheel + pip install tomli + pip install h5py + + # tensorboard + pip install tensorboard + pip install tensorboard_plugin_profile + pip install tensorboard-plugin-wit + pip install tensorboard-pytorch + + pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git + ``` +- Install mlperf-logging + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + git clone -b hpc-1.0-branch https://github.com/mlcommons/logging mlperf-logging + # may need to manually change mlperf-logging/VERSION to a valid version number (e.g. 1.0.0.rc2) + pip install -e mlperf-logging + + rm ${PYVENV_SITEPKGS}/mlperf-logging.egg-link + cp -r ./mlperf-logging/mlperf_logging ${PYVENV_SITEPKGS}/mlperf_logging + cp -r ./mlperf-logging/mlperf_logging.egg-info ${PYVENV_SITEPKGS}/mlperf_logging.egg-info + ``` + +#### 2. Download src code +- Download from PSSG Frontier repo for DeepCAM (linked at top of README) + ```bash + # REPLACE WITH YOUR PATH + PRFX=/lustre/orion/csc569/scratch/keshprad + DEEPCAM_ROOT=${PRFX}/deepcam + + mkdir -p ${DEEPCAM_ROOT} + cd ${DEEPCAM_ROOT} + + git clone https://github.com/hpcgroup/hpc.git hpc + ``` + +#### 3. 
Download dataset with globus +- [Globus Link](https://app.globus.org/file-manager?origin_id=0b226e2c-4de0-11ea-971a-021304b0cca7&origin_path=%2F) + - Download to `$DEEPCAM_ROOT/data` \ No newline at end of file diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh new file mode 100644 index 0000000..14cef72 --- /dev/null +++ b/DeepCAM/run_frontier_16.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH -t 01:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load ums/default +module load ums002/default +module load 
cray-hdf5-parallel/1.12.2.1 + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +mkdir -p ${JOB_OUTPUT_PATH} +export OMP_NUM_THREADS=1 +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export NCCL_SOCKET_IFNAME=hsn0 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +# Add AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH + +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" + +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. 
\ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" + +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=2 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run without profiler +export WITH_PROFILER=0 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE + + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh new file mode 100644 index 0000000..48e7059 --- /dev/null +++ b/DeepCAM/run_frontier_64.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH -t 01:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# 
Run like: sbatch run_frontier_64.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load ums/default +module load ums002/default +module load cray-hdf5-parallel/1.12.2.1 + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +mkdir -p ${JOB_OUTPUT_PATH} +export OMP_NUM_THREADS=1 +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +export MASTER_ADDR=$(hostname -i) +export MASTER_PORT=3442 +export NCCL_SOCKET_IFNAME=hsn0 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +# Add AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH + +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed 
$(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" + +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. \ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" + +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=8 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run without profiler +export WITH_PROFILER=0 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE + + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_crontab.sh b/DeepCAM/run_frontier_crontab.sh new file mode 100644 index 0000000..6d70161 --- /dev/null +++ b/DeepCAM/run_frontier_crontab.sh @@ -0,0 +1,19 @@ 
+#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 <num_nodes>" + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/DeepCAM/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file From 56e0fd5a4b4f47b807fe6db75cfeb1ab4c5476a9 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:56:25 -0500 Subject: [PATCH 08/13] use gpu-bind=none for frontier --- AMG2023/run_frontier_16.sh | 1 + AMG2023/run_frontier_64.sh | 2 +- gpu-benchmarks/allgather/run_frontier.sh | 3 ++- gpu-benchmarks/allreduce/run_frontier.sh | 3 ++- gpu-benchmarks/gemm/run_frontier.sh | 3 ++- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index d635c31..c0a69b0 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -3,6 +3,7 @@ 
#SBATCH -n 128 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8854ca1..8baabe8 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -3,6 +3,7 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J amg +#SBATCH --gpu-bind none #SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log @@ -39,7 +40,6 @@ module load rocm/6.1.3 export MPICH_GPU_SUPPORT_ENABLED=1 export CRAY_ACCEL_TARGET=gfx90a export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ -export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP/libmpiP.so # mpiP export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH export MPIP="-o -f $OUTPUT_DIR" diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 75216e8..79cedc7 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x @@ -50,4 +51,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log echo $CMD $CMD echo end allgather: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 729c539..56bd2fe 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -26,6 +26,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load 
cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x @@ -45,4 +46,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log echo $CMD $CMD echo end allreduce: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index d089dd1..9ccecbd 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -24,6 +24,7 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 module load cray-mpich/8.1.30 module load rocm/6.1.3 + module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x @@ -43,4 +44,4 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log echo $CMD $CMD echo end gemm: $(date) -} >> $OUTPUT_FILE +} &>> $OUTPUT_FILE From 175ee55cbf8adfb0ce633236d55dbb80e1064b7a Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Sat, 28 Dec 2024 21:59:15 -0500 Subject: [PATCH 09/13] use gpu-bind=none for deepcam on frontier --- DeepCAM/run_frontier_16.sh | 7 +++---- DeepCAM/run_frontier_64.sh | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh index 14cef72..e87f6e9 100644 --- a/DeepCAM/run_frontier_16.sh +++ b/DeepCAM/run_frontier_16.sh @@ -3,7 +3,8 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J deepcam -#SBATCH -t 01:30:00 +#SBATCH --gpu-bind none +#SBATCH -t 01:00:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log @@ -38,9 +39,7 @@ module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 module load craype-accel-amd-gfx90a module load 
cray-python/3.9.13.1 -module load ums/default -module load ums002/default -module load cray-hdf5-parallel/1.12.2.1 +module list # activate virtual env echo activating virtual env: diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh index 48e7059..fbe0b29 100644 --- a/DeepCAM/run_frontier_64.sh +++ b/DeepCAM/run_frontier_64.sh @@ -3,7 +3,8 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J deepcam -#SBATCH -t 01:30:00 +#SBATCH --gpu-bind none +#SBATCH -t 01:00:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log @@ -38,9 +39,7 @@ module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load ums/default -module load ums002/default -module load cray-hdf5-parallel/1.12.2.1 +module list # activate virtual env echo activating virtual env: From a087255bc2a22e152a5508f3103c88bfc7847ebc Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:28:22 -0500 Subject: [PATCH 10/13] update gpu-benchmarks to specify ROCM version --- AMG2023/run_frontier_16.sh | 7 ++++--- AMG2023/run_frontier_64.sh | 7 ++++--- gpu-benchmarks/allgather/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/allreduce/run_frontier.sh | 25 ++++++++++++++++-------- gpu-benchmarks/gemm/run_frontier.sh | 25 ++++++++++++++++-------- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh index c0a69b0..c51b52d 100644 --- a/AMG2023/run_frontier_16.sh +++ b/AMG2023/run_frontier_16.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash 
$PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh index 8baabe8..c7a7a3e 100644 --- a/AMG2023/run_frontier_64.sh +++ b/AMG2023/run_frontier_64.sh @@ -17,13 +17,14 @@ ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log # Run gpu benchmarks COMM_TYPE=mpi +ROCM_VERSION=6.1.3 PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR # echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $OUTPUT_DIR +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR APP_ROOT=/ccs/home/keshprad/AMG2023 cd $APP_ROOT diff --git 
a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh index 79cedc7..7fc10b4 100644 --- a/gpu-benchmarks/allgather/run_frontier.sh +++ b/gpu-benchmarks/allgather/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 <comm_type> <num_nodes> <output_dir>" +if [ "$#" -ne 4 ]; then + echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>" exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 +NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh index 56bd2fe..855a486 100644 --- a/gpu-benchmarks/allreduce/run_frontier.sh +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -4,16 +4,25 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh #!/bin/bash -if [ "$#" -ne 3 ]; then - echo "Usage: $0 <comm_type> <num_nodes> <output_dir>" +if [ "$#" -ne 4 ]; then + echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>" exit 1 fi # `mpi` or `rccl` COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 # `16` or `64` -NUM_NODES=$2 
+NUM_NODES=$3 # output directory -OUTPUT_DIR=$3 +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log @@ -23,13 +32,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE.x + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) MIN_MSG_SIZE=$((1 * 1024)) MAX_MSG_SIZE=$((1 * 1024 * 1024)) diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh index 9ccecbd..c5348be 100644 --- a/gpu-benchmarks/gemm/run_frontier.sh +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -4,14 +4,23 @@ # run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh #!/bin/bash -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " exit 1 fi +# `5.7.1` or `6.1.3` +ROCM_VERSION=$1 # `16` or `64` -NUM_NODES=$1 +NUM_NODES=$2 # output directory -OUTPUT_DIR=$2 +OUTPUT_DIR=$3 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log @@ -21,13 +30,13 @@ OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log module reset # load modules echo loading modules: - module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/6.1.3 - module load cray-mpich/8.1.30 - module load rocm/6.1.3 + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load 
cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} module list GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks - EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm.x + EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x NUM_TASKS=$(($NUM_NODES * 8)) export MPICH_GPU_SUPPORT_ENABLED=1 From a68f2a16f3b0a1cded21ddf8b7cafd46c293527a Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:36:47 -0500 Subject: [PATCH 11/13] updated nanogpt scripts and reduced batch size due to HIP OOM errors --- nanoGPT/run_frontier16.sh | 84 +++++++++++++-------------- nanoGPT/run_frontier64.sh | 84 +++++++++++++-------------- nanoGPT/train_gpt_neox_5B_frontier.py | 4 +- 3 files changed, 82 insertions(+), 90 deletions(-) diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh index 63718c5..901561e 100644 --- a/nanoGPT/run_frontier16.sh +++ b/nanoGPT/run_frontier16.sh @@ -3,30 +3,19 @@ #SBATCH -n 128 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier16.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash 
$PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" @@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py 
config/train_gpt_neox_5B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh index 1c9a75b..3201b51 100644 --- a/nanoGPT/run_frontier64.sh +++ b/nanoGPT/run_frontier64.sh @@ -3,30 +3,19 @@ #SBATCH -n 512 #SBATCH -q normal #SBATCH -J nanogpt -#SBATCH -t 01:00:00 +#SBATCH --gpu-bind none +#SBATCH -t 
00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log #SBATCH --exclusive # Run like: sbatch run_frontier64.sh +echo "start run: $(date)" export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/nanoGPT -cd $APP_ROOT - export SCRATCH="/lustre/orion/csc569/scratch/keshprad" export WRKSPC="${SCRATCH}/nanoGPT" export HF_HOME="${SCRATCH}/.cache/hf" @@ -35,56 +24,63 @@ export HF_DATASETS_CACHE="${HF_HOME}/datasets" cd $WRKSPC # load modules -rocm_version=6.1.3 +ROCM_VERSION=6.1.3 +echo resetting modules: module reset +echo loading modules: module load PrgEnv-gnu/8.5.0 -module load rocm/${rocm_version} +module load rocm/${ROCM_VERSION} module load craype-accel-amd-gfx90a module load cray-python/3.9.13.1 -module load gcc-native/12.3 module load cray-mpich/8.1.30 +module list # activate env source ${WRKSPC}/axonn_nanogpt/bin/activate NNODES=$SLURM_JOB_NUM_NODES GPUS=$(( NNODES * 8 )) ## master addr and port -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export WORLD_SIZE=${GPUS} +# setting variables for torch.distributed +export 
MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 -## nccl env vars to speedup stuff -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_NET_GDR_LEVEL=PHB -export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 export NCCL_CROSS_NIC=1 export NCCL_SOCKET_IFNAME=hsn0 -export NCCL_NET="AWS Libfabric" -export NCCL_TIMEOUT=1200 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1200 -export MPICH_GPU_SUPPORT_ENABLED=0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 # AWS-OFI-RCCL export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" -# run without profiler -export WITH_PROFILER=0 -# log start date -echo start nanoGPT_withoutprof: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT_withoutprof.log" -echo $run_cmd -eval $run_cmd -# log end date -echo end nanoGPT_withoutprof: $(date) - - # run with profiler export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" # log start date -echo start nanoGPT: $(date) -run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT &> $JOB_OUTPUT_PATH/output-nanoGPT.log" -echo $run_cmd -eval $run_cmd +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE # log end date -echo end nanoGPT: $(date) +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl 
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py index 5fcc430..4ce7b55 100644 --- a/nanoGPT/train_gpt_neox_5B_frontier.py +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -8,9 +8,9 @@ # these make the total batch size be ~0.5M # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 -batch_size = 32 +batch_size = 16 block_size = 512 -gradient_accumulation_steps = 1 * 128 #per_gpu x num_gpus +gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus # model n_layer = 24 From 2d2689ca7190fe975e040f0de85d5e3431b5740d Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:39:39 -0500 Subject: [PATCH 12/13] updated deepcam scripts to resolve nccl issues --- DeepCAM/run_frontier_16.sh | 79 +++++++++++++++++++------------------- DeepCAM/run_frontier_64.sh | 79 +++++++++++++++++++------------------- 2 files changed, 80 insertions(+), 78 deletions(-) diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh index e87f6e9..593608a 100644 --- a/DeepCAM/run_frontier_16.sh +++ b/DeepCAM/run_frontier_16.sh @@ -4,7 +4,7 @@ #SBATCH -q normal #SBATCH -J deepcam #SBATCH --gpu-bind none -#SBATCH -t 01:00:00 +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log #SBATCH --error 
/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log @@ -16,24 +16,15 @@ export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_log OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam cd $APP_WORKING_DIR -# reset modules +# load modules +ROCM_VERSION=6.1.3 echo resetting modules: module reset -# load modules echo loading modules: module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 @@ -47,20 +38,36 @@ source ${APP_ROOT}/.venv/bin/activate # ENV variables echo setting env vars: -mkdir -p ${JOB_OUTPUT_PATH} -export OMP_NUM_THREADS=1 -export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export NCCL_SOCKET_IFNAME=hsn0 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 # Needed to bypass MIOpen, Disk I/O Errors export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -# Add AWS-OFI-RCCL +## some RCCL env variables +export FI_CXI_ATS=0 +export 
HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" BENCH_RCP_FIXED="\ --gradient_accumulation_frequency 1 \ --logging_frequency 10 \ @@ -68,7 +75,6 @@ BENCH_RCP_FIXED="\ --seed $(date +%s) \ --batchnorm_group_size 1 \ --target_iou 0.80" - #BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. #It should not be modified. BENCH_RCP_BASELINE_LR="\ @@ -78,13 +84,12 @@ BENCH_RCP_BASELINE_LR="\ --lr_warmup_factor 1. \ --weight_decay 1e-2 \ --optimizer_betas 0.9 0.999" - BENCH_RCP_BASELINE="\ ${BENCH_RCP_FIXED} \ ${BENCH_RCP_BASELINE_LR}" # define command -MAX_EPOCHS=2 +MAX_EPOCHS=1 cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --gpu-bind=closest --gpus-per-task=1 \ --cpu-bind=none --hint=nomultithread \ @@ -98,21 +103,6 @@ cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --optimizer "Adam" \ --local_batch_size 2" -# run without profiler -export WITH_PROFILER=0 -OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" -# clear cache -rm -rf ${MIOPEN_USER_DB_PATH} -mkdir -p ${MIOPEN_USER_DB_PATH} -# log start date -echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE -# execute command -echo $cmd &>> $OUTPUT_FILE -eval $cmd &>> $OUTPUT_FILE -# log end date -echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE - - # run with profiler export WITH_PROFILER=1 OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" @@ -128,4 +118,15 @@ eval $cmd &>> $OUTPUT_FILE echo "end deepcam: $(date)" &>> $OUTPUT_FILE rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running 
allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh index fbe0b29..5c406fe 100644 --- a/DeepCAM/run_frontier_64.sh +++ b/DeepCAM/run_frontier_64.sh @@ -4,7 +4,7 @@ #SBATCH -q normal #SBATCH -J deepcam #SBATCH --gpu-bind none -#SBATCH -t 01:00:00 +#SBATCH -t 00:30:00 #SBATCH -A csc569 #SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log #SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log @@ -16,24 +16,15 @@ export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_log OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log -# Run gpu benchmarks -COMM_TYPE=rccl -PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability -echo running allreduce benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -# echo running allgather benchmark -# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH -echo running gemm benchmark -bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH - -APP_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam cd $APP_WORKING_DIR -# 
reset modules +# load modules +ROCM_VERSION=6.1.3 echo resetting modules: module reset -# load modules echo loading modules: module load PrgEnv-gnu/8.5.0 module load rocm/6.1.3 @@ -47,20 +38,36 @@ source ${APP_ROOT}/.venv/bin/activate # ENV variables echo setting env vars: -mkdir -p ${JOB_OUTPUT_PATH} -export OMP_NUM_THREADS=1 -export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" -export MASTER_ADDR=$(hostname -i) -export MASTER_PORT=3442 -export NCCL_SOCKET_IFNAME=hsn0 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 # Needed to bypass MIOpen, Disk I/O Errors export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} -# Add AWS-OFI-RCCL +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" BENCH_RCP_FIXED="\ --gradient_accumulation_frequency 1 \ --logging_frequency 10 \ @@ -68,7 +75,6 @@ BENCH_RCP_FIXED="\ --seed $(date +%s) \ --batchnorm_group_size 1 \ --target_iou 0.80" - #BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. #It should not be modified. BENCH_RCP_BASELINE_LR="\ @@ -78,13 +84,12 @@ BENCH_RCP_BASELINE_LR="\ --lr_warmup_factor 1. 
\ --weight_decay 1e-2 \ --optimizer_betas 0.9 0.999" - BENCH_RCP_BASELINE="\ ${BENCH_RCP_FIXED} \ ${BENCH_RCP_BASELINE_LR}" # define command -MAX_EPOCHS=8 +MAX_EPOCHS=4 cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --gpu-bind=closest --gpus-per-task=1 \ --cpu-bind=none --hint=nomultithread \ @@ -98,21 +103,6 @@ cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ --optimizer "Adam" \ --local_batch_size 2" -# run without profiler -export WITH_PROFILER=0 -OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam_withoutprof.log" -# clear cache -rm -rf ${MIOPEN_USER_DB_PATH} -mkdir -p ${MIOPEN_USER_DB_PATH} -# log start date -echo "start deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE -# execute command -echo $cmd &>> $OUTPUT_FILE -eval $cmd &>> $OUTPUT_FILE -# log end date -echo "end deepcam_withoutprof: $(date)" &>> $OUTPUT_FILE - - # run with profiler export WITH_PROFILER=1 OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" @@ -128,4 +118,15 @@ eval $cmd &>> $OUTPUT_FILE echo "end deepcam: $(date)" &>> $OUTPUT_FILE rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + echo "end run: $(date)" \ No newline at end of file From 94b59495c2bf67fd1c06d7f7341b1b7edce3f0f6 Mon Sep 17 00:00:00 2001 From: Keshav Pradeep <32313895+keshprad@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:41:38 -0500 Subject: [PATCH 13/13] milc scripts and params --- MILC/params_frontier.40.16 | 60 ++++ MILC/params_frontier.40.64 | 
60 ++++ MILC/rat.m013m065m838 | 554 +++++++++++++++++++++++++++++++++++ MILC/run_frontier_40.16.sh | 76 +++++ MILC/run_frontier_40.64.sh | 76 +++++ MILC/run_frontier_crontab.sh | 19 ++ 6 files changed, 845 insertions(+) create mode 100644 MILC/params_frontier.40.16 create mode 100644 MILC/params_frontier.40.64 create mode 100644 MILC/rat.m013m065m838 create mode 100644 MILC/run_frontier_40.16.sh create mode 100644 MILC/run_frontier_40.64.sh create mode 100644 MILC/run_frontier_crontab.sh diff --git a/MILC/params_frontier.40.16 b/MILC/params_frontier.40.16 new file mode 100644 index 0000000..3593cab --- /dev/null +++ b/MILC/params_frontier.40.16 @@ -0,0 +1,60 @@ +prompt 0 +nx 80 +ny 160 +nz 160 +nt 160 +node_geometry 2 4 4 4 +ionode_geometry 2 4 4 4 +iseed 5682304 +n_pseudo 5 +load_rhmc_params rat.m013m065m838 +beta 5.60 +n_dyn_masses 3 +dyn_mass 0.013 0.065 0.838 +dyn_flavors 2 1 1 +u0 0.85535 + +warms 0 +trajecs 2 +traj_between_meas 2 +microcanonical_time_step 0.2 +steps_per_trajectory 2 +cgresid_md_fa_gr .000000025 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .000000005 .000000002 .000000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +prec_ff 1 + +number_of_pbp_masses 3 +max_cg_prop 1750 +max_cg_prop_restarts 5 +npbp_reps 1 +prec_pbp 2 +mass 0.013 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.065 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.838 +naik_term_epsilon -0.358197 +error_for_propagator 2e-8 +rel_error_for_propagator 0 + +fresh +#reload_serial l1216b560m013m065m838.test +forget + diff --git 
a/MILC/params_frontier.40.64 b/MILC/params_frontier.40.64 new file mode 100644 index 0000000..73826ed --- /dev/null +++ b/MILC/params_frontier.40.64 @@ -0,0 +1,60 @@ +prompt 0 +nx 80 +ny 160 +nz 320 +nt 320 +node_geometry 2 4 8 8 +ionode_geometry 2 4 8 8 +iseed 5682304 +n_pseudo 5 +load_rhmc_params rat.m013m065m838 +beta 5.60 +n_dyn_masses 3 +dyn_mass 0.013 0.065 0.838 +dyn_flavors 2 1 1 +u0 0.85535 + +warms 0 +trajecs 2 +traj_between_meas 2 +microcanonical_time_step 0.2 +steps_per_trajectory 2 +cgresid_md_fa_gr .000000025 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .00000005 .00000002 .00000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +cgresid_md_fa_gr .000000005 .000000002 .000000002 +max_multicg_md_fa_gr 1750 1750 1750 +cgprec_md_fa_gr 2 2 2 +prec_ff 1 + +number_of_pbp_masses 3 +max_cg_prop 1750 +max_cg_prop_restarts 5 +npbp_reps 1 +prec_pbp 2 +mass 0.013 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.065 +naik_term_epsilon 0 +error_for_propagator 2e-7 +rel_error_for_propagator 0 +mass 0.838 +naik_term_epsilon -0.358197 +error_for_propagator 2e-8 +rel_error_for_propagator 0 + +fresh +#reload_serial l1216b560m013m065m838.test +forget + diff --git a/MILC/rat.m013m065m838 b/MILC/rat.m013m065m838 new file mode 100644 index 0000000..af0212f --- /dev/null +++ b/MILC/rat.m013m065m838 @@ -0,0 +1,554 @@ +n_pseudo 5 + +naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.013000^2)^(2/4) (x+4*0.065000^2)^(1/4) (x+4*0.200000^2)^(-3/4) (x+4*99.900000^2)^(0/4) +# Converged at 1215 iterations, 
error = 4.464654e-10 + + +# Rational function for MD +y_MD -2 -1 3 0 +z_MD 4 4 4 4 +m_MD 0.013000 0.065000 0.200000 99.900000 +order_MD 9 + +res_MD 1.0000000005312786e+00 +res_MD 5.1478424553065552e-03 +res_MD 6.1255750425527403e-03 +res_MD 8.2831942811878966e-03 +res_MD 1.2118242740786475e-02 +res_MD 1.8952479058480964e-02 +res_MD 2.9418638688841297e-02 +res_MD 1.8470115784153345e-02 +res_MD 1.2658218218545392e-02 +res_MD 4.2626859206241910e-03 + +pole_MD 99.9 +pole_MD 7.0774135543816004e-04 +pole_MD 9.9827758499373954e-04 +pole_MD 1.8047757109780971e-03 +pole_MD 3.7411067368715600e-03 +pole_MD 8.1989400404790334e-03 +pole_MD 1.6961565896872453e-02 +pole_MD 3.6947931361687537e-02 +pole_MD 7.5128029017199366e-02 +pole_MD 1.2749566440898313e-01 + +# CHECK: f(1.000000e-15) = 2.698640e+01 = 2.698640e+01? + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (11,11) +# Approximating the function (x+4*0.013000^2)^(2/8) (x+4*0.065000^2)^(1/8) (x+4*0.200000^2)^(-3/8) (x+4*99.900000^2)^(0/8) +# Converged at 1458 iterations, error = 3.338720e-12 + + +# Rational function for GR +y_GR 2 1 -3 0 +z_GR 8 8 8 8 +m_GR 0.013000 0.065000 0.200000 99.900000 +order_GR 11 + +res_GR 9.9999999999569866e-01 +res_GR -4.0060866788845987e-06 +res_GR -1.5999218783896372e-05 +res_GR -4.4655743629733473e-05 +res_GR -1.1314117169439806e-04 +res_GR -2.7282105670893862e-04 +res_GR -6.1067990531200617e-04 +res_GR -1.9139700349766983e-03 +res_GR -5.0187173668868809e-03 +res_GR -1.0796690978882167e-02 +res_GR -1.8360260474439800e-02 +res_GR -2.0567557871129686e-02 + +pole_GR 99.9 +pole_GR 7.4665670046294302e-04 +pole_GR 1.0337590207549056e-03 +pole_GR 1.6932805716009024e-03 +pole_GR 3.0741725059849415e-03 +pole_GR 5.8941171906434549e-03 +pole_GR 1.1579298991070606e-02 +pole_GR 2.3594965292107566e-02 +pole_GR 4.3836643614606971e-02 +pole_GR 7.6692462791476457e-02 +pole_GR 1.1910887316297561e-01 +pole_GR 
1.5357310472237296e-01 + +# CHECK: f(1.000000e-15) = 1.924986e-01 = 1.924986e-01? + + +# Rational function for FA +y_FA -2 -1 3 0 +z_FA 8 8 8 8 +m_FA 0.013000 0.065000 0.200000 99.900000 +order_FA 11 + +res_FA 1.0000000000043012e+00 +res_FA 2.7624276919541819e-04 +res_FA 5.2622338238058285e-04 +res_FA 8.8065939393702753e-04 +res_FA 1.4738849748348461e-03 +res_FA 2.5034718455081732e-03 +res_FA 4.4044075660614667e-03 +res_FA 9.0907008499393795e-03 +res_FA 1.1167054802824348e-02 +res_FA 1.2487688569309182e-02 +res_FA 1.0529250690731072e-02 +res_FA 4.3789150648981231e-03 + +pole_FA 99.9 +pole_FA 7.1219307391404054e-04 +pole_FA 9.3554136557657695e-04 +pole_FA 1.4789436727802529e-03 +pole_FA 2.6307764047071667e-03 +pole_FA 4.9930192831341728e-03 +pole_FA 9.7679165430922116e-03 +pole_FA 1.8753770710693469e-02 +pole_FA 3.4890062285776381e-02 +pole_FA 6.2841673100822434e-02 +pole_FA 1.0283414918156956e-01 +pole_FA 1.4327078903261969e-01 + +# CHECK: f(1.000000e-15) = 5.194844e+00 = 5.194844e+00? +naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.200000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 327 iterations, error = 2.398230e-07 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.200000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.4922969612472456e-01 +res_MD 4.6061009721530329e-02 +res_MD 1.1379997711196768e-01 +res_MD 2.7453631915038118e-01 +res_MD 6.8761853150948216e-01 +res_MD 1.8320055253741194e+00 +res_MD 5.8748098933529755e+00 +res_MD 3.8086202810075271e+01 + +pole_MD 99.9 +pole_MD 1.8528257058192529e-01 +pole_MD 3.7539911712311358e-01 +pole_MD 1.0581221519179564e+00 +pole_MD 3.4031251236671252e+00 +pole_MD 1.1740502300224540e+01 +pole_MD 4.5730041178720789e+01 +pole_MD 2.8391564624137345e+02 + +# CHECK: f(1.000000e-15) = 
1.581138e+00 = 1.581139e+00? + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.200000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 422 iterations, error = 1.700228e-09 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.200000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 2.7327733614742757e+00 +res_GR -5.1288622160737063e-03 +res_GR -2.0478919078421993e-02 +res_GR -6.3951352353102550e-02 +res_GR -1.9268600508174474e-01 +res_GR -5.8519459892515535e-01 +res_GR -1.8580289566259904e+00 +res_GR -6.6894626972932318e+00 +res_GR -3.4401695696851874e+01 +res_GR -6.1789841402475429e+02 + +pole_GR 99.9 +pole_GR 1.8642524371106700e-01 +pole_GR 3.1582609770026626e-01 +pole_GR 6.7910597147834129e-01 +pole_GR 1.6420879353554103e+00 +pole_GR 4.2025399779967945e+00 +pole_GR 1.1215270946596673e+01 +pole_GR 3.2072436285328692e+01 +pole_GR 1.1032630752536383e+02 +pole_GR 7.6441549844451458e+02 + +# CHECK: f(1.000000e-15) = 7.952707e-01 = 7.952707e-01? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.200000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.6592862551196720e-01 +res_FA 1.0931558584388677e-02 +res_FA 2.9290340023195943e-02 +res_FA 6.7875566115597738e-02 +res_FA 1.5709264288335881e-01 +res_FA 3.7002733866738813e-01 +res_FA 9.0645109653452494e-01 +res_FA 2.4523352073991669e+00 +res_FA 8.7150280660454786e+00 +res_FA 7.5897099555179366e+01 + +pole_FA 99.9 +pole_FA 1.7887536305510454e-01 +pole_FA 2.9094384593655243e-01 +pole_FA 6.1203693854712271e-01 +pole_FA 1.4648617324427462e+00 +pole_FA 3.7284495585739217e+00 +pole_FA 9.8932956134892081e+00 +pole_FA 2.7949316233288556e+01 +pole_FA 9.2734993617230558e+01 +pole_FA 5.4606225005034105e+02 + +# CHECK: f(1.000000e-15) = 1.257433e+00 = 1.257433e+00? 
+naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.200000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 327 iterations, error = 2.398230e-07 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.200000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.4922969612472456e-01 +res_MD 4.6061009721530329e-02 +res_MD 1.1379997711196768e-01 +res_MD 2.7453631915038118e-01 +res_MD 6.8761853150948216e-01 +res_MD 1.8320055253741194e+00 +res_MD 5.8748098933529755e+00 +res_MD 3.8086202810075271e+01 + +pole_MD 99.9 +pole_MD 1.8528257058192529e-01 +pole_MD 3.7539911712311358e-01 +pole_MD 1.0581221519179564e+00 +pole_MD 3.4031251236671252e+00 +pole_MD 1.1740502300224540e+01 +pole_MD 4.5730041178720789e+01 +pole_MD 2.8391564624137345e+02 + +# CHECK: f(1.000000e-15) = 1.581138e+00 = 1.581139e+00? 
+ + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.200000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 422 iterations, error = 1.700228e-09 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.200000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 2.7327733614742757e+00 +res_GR -5.1288622160737063e-03 +res_GR -2.0478919078421993e-02 +res_GR -6.3951352353102550e-02 +res_GR -1.9268600508174474e-01 +res_GR -5.8519459892515535e-01 +res_GR -1.8580289566259904e+00 +res_GR -6.6894626972932318e+00 +res_GR -3.4401695696851874e+01 +res_GR -6.1789841402475429e+02 + +pole_GR 99.9 +pole_GR 1.8642524371106700e-01 +pole_GR 3.1582609770026626e-01 +pole_GR 6.7910597147834129e-01 +pole_GR 1.6420879353554103e+00 +pole_GR 4.2025399779967945e+00 +pole_GR 1.1215270946596673e+01 +pole_GR 3.2072436285328692e+01 +pole_GR 1.1032630752536383e+02 +pole_GR 7.6441549844451458e+02 + +# CHECK: f(1.000000e-15) = 7.952707e-01 = 7.952707e-01? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.200000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.6592862551196720e-01 +res_FA 1.0931558584388677e-02 +res_FA 2.9290340023195943e-02 +res_FA 6.7875566115597738e-02 +res_FA 1.5709264288335881e-01 +res_FA 3.7002733866738813e-01 +res_FA 9.0645109653452494e-01 +res_FA 2.4523352073991669e+00 +res_FA 8.7150280660454786e+00 +res_FA 7.5897099555179366e+01 + +pole_FA 99.9 +pole_FA 1.7887536305510454e-01 +pole_FA 2.9094384593655243e-01 +pole_FA 6.1203693854712271e-01 +pole_FA 1.4648617324427462e+00 +pole_FA 3.7284495585739217e+00 +pole_FA 9.8932956134892081e+00 +pole_FA 2.7949316233288556e+01 +pole_FA 9.2734993617230558e+01 +pole_FA 5.4606225005034105e+02 + +# CHECK: f(1.000000e-15) = 1.257433e+00 = 1.257433e+00? 
+naik_term_epsilon 0 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.200000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 327 iterations, error = 2.398230e-07 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.200000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.4922969612472456e-01 +res_MD 4.6061009721530329e-02 +res_MD 1.1379997711196768e-01 +res_MD 2.7453631915038118e-01 +res_MD 6.8761853150948216e-01 +res_MD 1.8320055253741194e+00 +res_MD 5.8748098933529755e+00 +res_MD 3.8086202810075271e+01 + +pole_MD 99.9 +pole_MD 1.8528257058192529e-01 +pole_MD 3.7539911712311358e-01 +pole_MD 1.0581221519179564e+00 +pole_MD 3.4031251236671252e+00 +pole_MD 1.1740502300224540e+01 +pole_MD 4.5730041178720789e+01 +pole_MD 2.8391564624137345e+02 + +# CHECK: f(1.000000e-15) = 1.581138e+00 = 1.581139e+00? 
+ + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.200000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 422 iterations, error = 1.700228e-09 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.200000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 2.7327733614742757e+00 +res_GR -5.1288622160737063e-03 +res_GR -2.0478919078421993e-02 +res_GR -6.3951352353102550e-02 +res_GR -1.9268600508174474e-01 +res_GR -5.8519459892515535e-01 +res_GR -1.8580289566259904e+00 +res_GR -6.6894626972932318e+00 +res_GR -3.4401695696851874e+01 +res_GR -6.1789841402475429e+02 + +pole_GR 99.9 +pole_GR 1.8642524371106700e-01 +pole_GR 3.1582609770026626e-01 +pole_GR 6.7910597147834129e-01 +pole_GR 1.6420879353554103e+00 +pole_GR 4.2025399779967945e+00 +pole_GR 1.1215270946596673e+01 +pole_GR 3.2072436285328692e+01 +pole_GR 1.1032630752536383e+02 +pole_GR 7.6441549844451458e+02 + +# CHECK: f(1.000000e-15) = 7.952707e-01 = 7.952707e-01? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.200000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.6592862551196720e-01 +res_FA 1.0931558584388677e-02 +res_FA 2.9290340023195943e-02 +res_FA 6.7875566115597738e-02 +res_FA 1.5709264288335881e-01 +res_FA 3.7002733866738813e-01 +res_FA 9.0645109653452494e-01 +res_FA 2.4523352073991669e+00 +res_FA 8.7150280660454786e+00 +res_FA 7.5897099555179366e+01 + +pole_FA 99.9 +pole_FA 1.7887536305510454e-01 +pole_FA 2.9094384593655243e-01 +pole_FA 6.1203693854712271e-01 +pole_FA 1.4648617324427462e+00 +pole_FA 3.7284495585739217e+00 +pole_FA 9.8932956134892081e+00 +pole_FA 2.7949316233288556e+01 +pole_FA 9.2734993617230558e+01 +pole_FA 5.4606225005034105e+02 + +# CHECK: f(1.000000e-15) = 1.257433e+00 = 1.257433e+00? 
+naik_term_epsilon -0.358197 + + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (7,7) +# Approximating the function (x+4*0.838000^2)^(1/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) (x+4*99.900000^2)^(0/4) +# Converged at 254 iterations, error = 1.451256e-10 + + +# Rational function for MD +y_MD -1 0 0 0 +z_MD 4 4 4 4 +m_MD 0.838000 99.900000 99.900000 99.900000 +order_MD 7 + +res_MD 1.2322471525321263e-01 +res_MD 2.1771921408584441e-01 +res_MD 4.4206637506002772e-01 +res_MD 8.2618794992683819e-01 +res_MD 1.6297128076156222e+00 +res_MD 3.6167435981445921e+00 +res_MD 1.0509605478166112e+01 +res_MD 6.7097093571965161e+01 + +pole_MD 99.9 +pole_MD 3.0160974627586552e+00 +pole_MD 4.3550360604645304e+00 +pole_MD 7.9796160098146069e+00 +pole_MD 1.7084127350226733e+01 +pole_MD 4.1574237400987570e+01 +pole_MD 1.2322623145255724e+02 +pole_MD 6.4326627115402380e+02 + +# CHECK: f(1.000000e-15) = 7.724369e-01 = 7.724369e-01? 
+ + + +# New rational function +# Approximation bounds are [1.000000e-15,9.000000e+01] +# Precision of arithmetic is 75 +# Degree of the approximation is (9,9) +# Approximating the function (x+4*0.838000^2)^(1/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) (x+4*99.900000^2)^(0/8) +# Converged at 328 iterations, error = 1.426221e-13 + + +# Rational function for GR +y_GR 1 0 0 0 +z_GR 8 8 8 8 +m_GR 0.838000 99.900000 99.900000 99.900000 +order_GR 9 + +res_GR 3.0073112563922351e+00 +res_GR -5.3234413512341650e-02 +res_GR -1.7673165828433515e-01 +res_GR -4.2726410063882991e-01 +res_GR -9.7790664416295114e-01 +res_GR -2.2882846047810173e+00 +res_GR -5.8208997626948111e+00 +res_GR -1.7844293878766898e+01 +res_GR -8.4032413065091191e+01 +res_GR -1.4669209010508403e+03 + +pole_GR 99.9 +pole_GR 3.0252410237704845e+00 +pole_GR 3.9660872619800918e+00 +pole_GR 6.1039445287657701e+00 +pole_GR 1.0530577643916541e+01 +pole_GR 1.9759567382759283e+01 +pole_GR 4.0165388592627771e+01 +pole_GR 9.1668640228708554e+01 +pole_GR 2.6672834021452144e+02 +pole_GR 1.6783103311924824e+03 + +# CHECK: f(1.000000e-15) = 1.137807e+00 = 1.137807e+00? + + +# Rational function for FA +y_FA -1 0 0 0 +z_FA 8 8 8 8 +m_FA 0.838000 99.900000 99.900000 99.900000 +order_FA 9 + +res_FA 3.3252294649396041e-01 +res_FA 6.7957577334402247e-02 +res_FA 1.5690077204608199e-01 +res_FA 2.9390758191833499e-01 +res_FA 5.4247602885208523e-01 +res_FA 1.0357646568428991e+00 +res_FA 2.1330614641295562e+00 +res_FA 5.1181689865260589e+00 +res_FA 1.7166623006718723e+01 +res_FA 1.4802690454646185e+02 + +pole_FA 99.9 +pole_FA 2.9645701362629495e+00 +pole_FA 3.7967708400810514e+00 +pole_FA 5.7427951679138749e+00 +pole_FA 9.7876489526599268e+00 +pole_FA 1.8188863360964667e+01 +pole_FA 3.6571166564954488e+01 +pole_FA 8.1929059816469248e+01 +pole_FA 2.2810985650690961e+02 +pole_FA 1.2082659732685304e+03 + +# CHECK: f(1.000000e-15) = 8.788839e-01 = 8.788839e-01? 
diff --git a/MILC/run_frontier_40.16.sh b/MILC/run_frontier_40.16.sh new file mode 100644 index 0000000..0f76532 --- /dev/null +++ b/MILC/run_frontier_40.16.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=milc_40.16 +#SBATCH --account=csc569 +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=16 +#SBATCH --ntasks=128 +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH -t 00:30:00 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/16nodes/%x-%j/job-error.log +# RUN LIKE: sbatch run_frontier_40.16.sh + +echo "start run: $(date)" + +ORION_SCRATCH=/lustre/orion/csc569/scratch/keshprad +OUTPUT_DIR=$ORION_SCRATCH/perfvar/MILC_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +MILC_OUTPUT_FILE=$OUTPUT_DIR/output-MILC.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +# define paths variables +BENCH_TOPDIR=/ccs/home/keshprad/MILC/OLCF-6_MILC_benchmark +MILC_QCD_DIR=${BENCH_TOPDIR}/build/milc_qcd +exe=${MILC_QCD_DIR}/ks_imp_rhmc/su3_rhmd_hisq +input=$PERF_VARIABILITY_ROOT/MILC/params_frontier.40.16 +# Load modules, setup environment +source ${BENCH_TOPDIR}/build/env.sh + +# Define environment variables +# mpich +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_NEMESIS_ASYNC_PROGRESS=1 +# quda +export QUDA_ENABLE_GDR=1 +export QUDA_ENABLE_P2P=1 +export 
QUDA_MILC_HISQ_RECONSTRUCT=13 +export QUDA_MILC_HISQ_RECONSTRUCT_SLOPPY=9 +# omp +export OMP_NUM_THREADS=7 +export OMP_PROC_BIND="spread, spread, spread" +export SLURM_CPU_BIND="cores" + +# qudatune +# Tuning results are stored in qudatune_dir. +export QUDA_RESOURCE_PATH="$ORION_SCRATCH/perfvar/MILC/qudatune_40.16" +if [ ! -d ${QUDA_RESOURCE_PATH} ]; then + mkdir -p ${QUDA_RESOURCE_PATH} +fi + +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}:$LD_LIBRARY_PATH +export MPIP="-o -f $OUTPUT_DIR" +export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}/libmpiP.so + +# run milc +cd $PERF_VARIABILITY_ROOT/MILC/ +command="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \ + -n $SLURM_NTASKS -c 7 \ + $exe $input $MILC_OUTPUT_FILE" +echo running milc +echo $command &>> $MILC_OUTPUT_FILE +eval $command &>> $MILC_OUTPUT_FILE + +echo end run: $(date) \ No newline at end of file diff --git a/MILC/run_frontier_40.64.sh b/MILC/run_frontier_40.64.sh new file mode 100644 index 0000000..3bbbebe --- /dev/null +++ b/MILC/run_frontier_40.64.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=milc_40.64 +#SBATCH --account=csc569 +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=64 +#SBATCH --ntasks=512 +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH -t 00:30:00 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/MILC_logs/64nodes/%x-%j/job-error.log +# RUN LIKE: sbatch run_frontier_40.64.sh + +echo "start run: $(date)" + +ORION_SCRATCH=/lustre/orion/csc569/scratch/keshprad +OUTPUT_DIR=$ORION_SCRATCH/perfvar/MILC_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +MILC_OUTPUT_FILE=$OUTPUT_DIR/output-MILC.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash 
$PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +# define paths variables +BENCH_TOPDIR=/ccs/home/keshprad/MILC/OLCF-6_MILC_benchmark +MILC_QCD_DIR=${BENCH_TOPDIR}/build/milc_qcd +exe=${MILC_QCD_DIR}/ks_imp_rhmc/su3_rhmd_hisq +input=$PERF_VARIABILITY_ROOT/MILC/params_frontier.40.64 +# Load modules, setup environment +source ${BENCH_TOPDIR}/build/env.sh + +# Define environment variables +# mpich +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_NEMESIS_ASYNC_PROGRESS=1 +# quda +export QUDA_ENABLE_GDR=1 +export QUDA_ENABLE_P2P=1 +export QUDA_MILC_HISQ_RECONSTRUCT=13 +export QUDA_MILC_HISQ_RECONSTRUCT_SLOPPY=9 +# omp +export OMP_NUM_THREADS=7 +export OMP_PROC_BIND="spread, spread, spread" +export SLURM_CPU_BIND="cores" + +# qudatune +# Tuning results are stored in qudatune_dir. +export QUDA_RESOURCE_PATH="$ORION_SCRATCH/perfvar/MILC/qudatune_40.64" +if [ ! 
-d ${QUDA_RESOURCE_PATH} ]; then
+  mkdir -p ${QUDA_RESOURCE_PATH}
+fi
+
+# mpiP
+export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}:$LD_LIBRARY_PATH
+export MPIP="-o -f $OUTPUT_DIR"
+export MPIP_DLL_PATH=/ccs/home/keshprad/mpiP_rocm-${CRAY_ROCM_VERSION}/libmpiP.so
+
+# run milc
+cd $PERF_VARIABILITY_ROOT/MILC/
+command="srun --export=ALL,LD_PRELOAD=$MPIP_DLL_PATH \
+    -n $SLURM_NTASKS -c 7 \
+    $exe $input $MILC_OUTPUT_FILE"
+echo running milc
+echo $command &>> $MILC_OUTPUT_FILE
+eval $command &>> $MILC_OUTPUT_FILE
+
+echo end run: $(date)
\ No newline at end of file
diff --git a/MILC/run_frontier_crontab.sh b/MILC/run_frontier_crontab.sh
new file mode 100644
index 0000000..8aa0ef3
--- /dev/null
+++ b/MILC/run_frontier_crontab.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <num_nodes>"
+    exit 1
+fi
+# `16` or `64`
+NUM_NODES=$1
+
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+
+# load lmod
+source /usr/share/lmod/lmod/init/bash
+# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH
+export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps
+export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles
+
+# run sbatch script 
+script=$PERF_VARIABILITY_ROOT/MILC/run_frontier_40.$NUM_NODES\.sh +sbatch $script \ No newline at end of file