diff --git a/AMG2023/README.md b/AMG2023/README.md
index 476ad56..14c75c8 100644
--- a/AMG2023/README.md
+++ b/AMG2023/README.md
@@ -1,9 +1,9 @@
 # AMG2023 README
 For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf).
-## Perlmutter Compilation
+Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/)
-Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
+## Perlmutter Compilation

 ### Steps to Compile

@@ -50,5 +50,61 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
    cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 ..
    ```

-## Frontier Installation
+## Frontier Compilation
+
+### Steps to Compile
+
+1. Load modules
+   ```sh
+   module reset
+
+   module load cray-mpich/8.1.30
+   module load craype-accel-amd-gfx90a
+   module load rocm/6.1.3
+   export MPICH_GPU_SUPPORT_ENABLED=1
+
+   # load a compatible cmake version
+   module load Core/24.07
+   module load cmake/3.27.9
+   ```
+2. Configure hypre (v2.32.0)
+   - Clone hypre v2.32.0 and navigate to src:
+     ```sh
+     git clone -b v2.32.0 https://github.com/hypre-space/hypre.git
+     cd hypre/src
+     ```
+   - Configure hypre (in hypre/src)
+     ```sh
+     ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \
+         --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \
+         --with-MPI-include="${MPICH_DIR}/include" \
+         CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \
+         LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse"
+     ```
+   - Compile hypre (in hypre/src)
+     ```sh
+     # build with make
+     make
+     ```
+3. Configure AMG2023
+   - Clone repo:
+     ```sh
+     git clone https://github.com/pssg-int/AMG2023
+     cd AMG2023
+     ```
+   - Add mpiP to LD_LIBRARY_PATH
+     ```sh
+     export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+     ```
+   - Configure cmake
+     ```sh
+     mkdir build && cd build
+     cmake .. \
-DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \ + -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \ + -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand" + ``` + - Compile AMG2023 (in AMG2023/build) + ```sh + make install + ``` diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh new file mode 100644 index 0000000..c51b52d --- /dev/null +++ b/AMG2023/run_frontier_16.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.30 +module load craype-accel-amd-gfx90a +module load rocm/6.1.3 + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH +export MPIP="-o -f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ + ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh new file mode 100644 index 0000000..c7a7a3e --- /dev/null +++ b/AMG2023/run_frontier_64.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J amg +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log +ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log + +# Run gpu benchmarks +COMM_TYPE=mpi +ROCM_VERSION=6.1.3 +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +# echo running allgather 
benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR + +APP_ROOT=/ccs/home/keshprad/AMG2023 +cd $APP_ROOT + +# reset modules +echo resetting modules: +module reset +# load modules +echo loading modules: +module load cray-mpich/8.1.30 +module load craype-accel-amd-gfx90a +module load rocm/6.1.3 + +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=gfx90a +export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/ +# mpiP +export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH +export MPIP="-o -f $OUTPUT_DIR" + +# log start date +echo start AMG2023: $(date) +# define command +cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \ + ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500" +echo solving: +echo $cmd +$cmd +# log end date +echo end AMG2023: $(date) diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh new file mode 100644 index 0000000..09b0f66 --- /dev/null +++ b/AMG2023/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/DeepCAM/README.md b/DeepCAM/README.md new file mode 100644 index 0000000..94e6880 --- /dev/null +++ b/DeepCAM/README.md @@ -0,0 +1,131 @@ +# DeepCAM README +For more detailed installation parameters, please refer to DeepCAM install guide + +Perlmutter Repository: [hpc_results_v3.0](https://github.com/hpcgroup/hpc_results_v3.0) +Frontier Repository: [hpc](https://github.com/hpcgroup/hpc) + + +## Perlmutter Setup + +### Setup steps + +## Frontier Setup + +### Setup steps + +#### 1. 
Pytorch Install +- Load modules + ```bash + module reset + module load PrgEnv-gnu/8.5.0 + module load rocm/6.1.3 + module load craype-accel-amd-gfx90a + module load cray-python/3.9.13.1 +- Create env variables + ```bash + DEEPCAM_ROOT=/lustre/orion/csc569/scratch/keshprad/deepcam/ + PYVENV_ROOT=${DEEPCAM_ROOT}/.venv + PYVENV_SITEPKGS=${PYVENV_ROOT}/lib/python3.9/site-packages + + cd ${DEEPCAM_ROOT} + ``` +- Create python virtual env + ```bash + python -m venv ${PYVENV_ROOT} + source ${PYVENV_ROOT}/bin/activate + ``` +- Install torch and mpi4py + ```bash + # torch==2.5.0 + pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.1 + + MPICC="cc -shared" pip install --no-cache-dir --no-binary=mpi4py mpi4py + ``` +- Install AWS-OCI-RCCL plugin + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + rocm_version=6.1.3 + # Load modules + module load PrgEnv-gnu/8.5.0 + module load rocm/$rocm_version + module load craype-accel-amd-gfx90a + module load gcc-native/12.3 + module load cray-mpich/8.1.30 + #module load libtool + libfabric_path=/opt/cray/libfabric/1.15.2.0 + + # Download the plugin repo + git clone --recursive https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl + cd aws-ofi-rccl + + # Build the plugin + ./autogen.sh + export LD_LIBRARY_PATH=/opt/rocm-$rocm_version/hip/lib:$LD_LIBRARY_PATH + PLUG_PREFIX=$PWD + + CC=hipcc CFLAGS=-I/opt/rocm-$rocm_version/rccl/include ./configure \ + --with-libfabric=$libfabric_path --with-rccl=/opt/rocm-$rocm_version --enable-trace \ + --prefix=$PLUG_PREFIX --with-hip=/opt/rocm-$rocm_version/hip --with-mpi=$MPICH_DIR + + make + make install + + # Reminder to export the plugin to your path + echo $PLUG_PREFIX + echo "Add the following line in the environment to use the AWS OFI RCCL plugin" + echo "export LD_LIBRARY_PATH="$PLUG_PREFIX"/lib:$""LD_LIBRARY_PATH" + ``` +- Install supporting dependencies + ```bash + cd ${DEEPCAM_ROOT} + + pip install wandb + pip install gym + pip install pyspark + pip install scikit-learn + pip install scikit-image + pip install opencv-python + pip install wheel + pip install tomli + pip install h5py + + # tensorboard + pip install tensorboard + pip install tensorboard_plugin_profile + pip install tensorboard-plugin-wit + pip install tensorboard-pytorch + + pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git + ``` +- Install mlperf-logging + ```bash + mkdir -p ${DEEPCAM_ROOT}/repos + cd ${DEEPCAM_ROOT}/repos + + git clone -b hpc-1.0-branch https://github.com/mlcommons/logging mlperf-logging + # may need to manually change mlperf-logging/VERSION to a valid version number (e.g. 1.0.0.rc2) + pip install -e mlperf-logging + + rm ${PYVENV_SITEPKGS}/mlperf-logging.egg-link + cp -r ./mlperf-logging/mlperf_logging ${PYVENV_SITEPKGS}/mlperf_logging + cp -r ./mlperf-logging/mlperf_logging.egg-info ${PYVENV_SITEPKGS}/mlperf_logging.egg-info + ``` + +#### 2. Download src code +- Download from PSSG Frontier repo for DeepCAM (linked at top of README) + ```bash + # REPLACE WITH YOUR PATH + PRFX=/lustre/orion/csc569/scratch/keshprad + DEEPCAM_ROOT=${PRFX}/deepcam + + mkdir -p ${DEEPCAM_ROOT} + cd ${DEEPCAM_ROOT} + + git clone https://github.com/hpcgroup/hpc.git hpc + ``` + +#### 3. 
Download dataset with globus +- [Globus Link](https://app.globus.org/file-manager?origin_id=0b226e2c-4de0-11ea-971a-021304b0cca7&origin_path=%2F) + - Download to `$DEEPCAM_ROOT/data` \ No newline at end of file diff --git a/DeepCAM/run_frontier_16.sh b/DeepCAM/run_frontier_16.sh new file mode 100644 index 0000000..593608a --- /dev/null +++ b/DeepCAM/run_frontier_16.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_16.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module list + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. 
\ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=1 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_64.sh b/DeepCAM/run_frontier_64.sh new file mode 100644 index 0000000..5c406fe --- /dev/null +++ b/DeepCAM/run_frontier_64.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J deepcam +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier_64.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/deepcam_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=${JOB_OUTPUT_PATH}/output-deepcam.log +ERROR_FILE=${JOB_OUTPUT_PATH}/error-deepcam.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export APP_ROOT="${SCRATCH}/deepcam" +APP_WORKING_DIR=${APP_ROOT}/hpc/deepcam/src/deepCam +cd $APP_WORKING_DIR + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/6.1.3 +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module list + +# activate virtual env +echo activating virtual env: +source ${APP_ROOT}/.venv/bin/activate + +# ENV variables +echo setting env vars: +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) + +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +# Needed to bypass MIOpen, Disk I/O Errors +export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache-${SLURM_JOB_ID}" +export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH} + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export 
NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH=${APP_ROOT}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +# deepcam setup +export RUN_TAG="${SLURM_JOB_NAME}-${SLURM_JOB_ID}" +BENCH_RCP_FIXED="\ + --gradient_accumulation_frequency 1 \ + --logging_frequency 10 \ + --save_frequency 0 \ + --seed $(date +%s) \ + --batchnorm_group_size 1 \ + --target_iou 0.80" +#BENCH_RCP_BASELINE_LR describes the learning rate for Baseline runs. +#It should not be modified. +BENCH_RCP_BASELINE_LR="\ + --start_lr 0.0055 \ + --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ + --lr_warmup_steps 400 \ + --lr_warmup_factor 1. \ + --weight_decay 1e-2 \ + --optimizer_betas 0.9 0.999" +BENCH_RCP_BASELINE="\ + ${BENCH_RCP_FIXED} \ + ${BENCH_RCP_BASELINE_LR}" + +# define command +MAX_EPOCHS=4 +cmd="srun --export=ALL --tasks-per-node=8 --gpus-per-node=8 \ + --gpu-bind=closest --gpus-per-task=1 \ + --cpu-bind=none --hint=nomultithread \ + python train.py \ + ${BENCH_RCP_BASELINE} \ + --data_dir_prefix ${APP_ROOT}/data/All-Hist \ + --run_tag ${RUN_TAG} \ + --output_dir ${JOB_OUTPUT_PATH} \ + --wireup_method nccl-slurm \ + --max_epochs ${MAX_EPOCHS} \ + --optimizer "Adam" \ + --local_batch_size 2" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-deepcam.log" +# clear cache +rm -rf ${MIOPEN_USER_DB_PATH} +mkdir -p ${MIOPEN_USER_DB_PATH} +# log start date +echo "start deepcam: $(date)" &>> $OUTPUT_FILE +# execute command +echo $cmd &>> $OUTPUT_FILE +eval $cmd &>> $OUTPUT_FILE +# log end date +echo "end deepcam: $(date)" &>> $OUTPUT_FILE + +rm -rf ${MIOPEN_USER_DB_PATH} + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/DeepCAM/run_frontier_crontab.sh b/DeepCAM/run_frontier_crontab.sh new file mode 100644 index 0000000..6d70161 --- /dev/null +++ b/DeepCAM/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles
+
+# run sbatch script
+script=$PERF_VARIABILITY_ROOT/DeepCAM/run_frontier_$NUM_NODES\.sh
+sbatch $script
\ No newline at end of file
diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md
new file mode 100644
index 0000000..c8f9c25
--- /dev/null
+++ b/gpu-benchmarks/README.md
@@ -0,0 +1,14 @@
+# gpu-benchmarks README
+Code Repository: [gpu-benchmarks](#TODO:)
+
+## Perlmutter Compilation
+
+### Steps to Compile
+
+TODO:
+
+## Frontier Compilation
+
+### Steps to Compile
+
+TODO:
\ No newline at end of file
diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh
new file mode 100644
index 0000000..7fc10b4
--- /dev/null
+++ b/gpu-benchmarks/allgather/run_frontier.sh
@@ -0,0 +1,63 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh
+
+#!/bin/bash
+if [ "$#" -ne 4 ]; then
+    echo "Usage: $0 <comm_type (mpi|rccl)> <rocm_version> <num_nodes> <output_dir>"
+    exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$2
+# `16` or `64`
+NUM_NODES=$3
+# output directory
+OUTPUT_DIR=$4
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+    MPICH_VERSION=8.1.30
+else
+    MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log
+
+{
+    # reset modules
+    echo resetting modules:
+    module reset
+    # load modules
+    echo loading modules:
+    module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+    module load cray-mpich/${MPICH_VERSION}
+    module load rocm/${ROCM_VERSION}
+    module list
+
+    GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+    EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x
+    NUM_TASKS=$(($NUM_NODES * 8))
+    MIN_MSG_SIZE=$((1 * 1024))
+    MAX_MSG_SIZE=$((1 * 1024 * 1024))
+    ITERATIONS=100
+
+    export MPICH_GPU_SUPPORT_ENABLED=1
+    export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+    echo start allgather: $(date)
+    # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun.
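+    # Illustrative expansion (assuming COMM_TYPE=mpi, ROCM_VERSION=6.1.3, NUM_NODES=16;
+    # all other values come from this script), the command assembled below becomes roughly:
+    #   srun -N 16 -n 128 --gpus-per-node 8 --gpus-per-task 1 --ntasks-per-node 8 \
+    #     --gpu-bind none --output $OUTPUT_FILE \
+    #     $GPU_BENCHMARKS_ROOT/allgather_mpi_rocm-6.1.3.x 128 1024 1048576 100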
+ CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --gpus-per-node 8 \ + --gpus-per-task 1 \ + --ntasks-per-node 8 \ + --gpu-bind none \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allgather: $(date) +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh new file mode 100644 index 0000000..855a486 --- /dev/null +++ b/gpu-benchmarks/allreduce/run_frontier.sh @@ -0,0 +1,58 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) + +# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh + +#!/bin/bash +if [ "$#" -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi +# `mpi` or `rccl` +COMM_TYPE=$1 +# `5.7.1` or `6.1.3` +ROCM_VERSION=$2 +# `16` or `64` +NUM_NODES=$3 +# output directory +OUTPUT_DIR=$4 + +# setup cray-mpich version +if [[ "$ROCM_VERSION" == "6.1.3" ]]; then + MPICH_VERSION=8.1.30 +else + MPICH_VERSION=8.1.28 +fi + +OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log + +{ + # reset modules + echo resetting modules: + module reset + # load modules + echo loading modules: + module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION} + module load cray-mpich/${MPICH_VERSION} + module load rocm/${ROCM_VERSION} + module list + + GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks + EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x + NUM_TASKS=$(($NUM_NODES * 8)) + MIN_MSG_SIZE=$((1 * 1024)) + MAX_MSG_SIZE=$((1 * 1024 * 1024)) + ITERATIONS=100 + + export MPICH_GPU_SUPPORT_ENABLED=1 + export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" + + echo start allreduce: $(date) + CMD="srun -N $NUM_NODES -n $NUM_TASKS \ + --output $OUTPUT_FILE \ + $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS" + echo running: + echo $CMD + $CMD + echo end allreduce: $(date) +} &>> $OUTPUT_FILE diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh new file mode 100644 index 0000000..c5348be --- /dev/null +++ b/gpu-benchmarks/gemm/run_frontier.sh @@ -0,0 +1,56 @@ +# This script assumes it is being run by another sbatch script, +# so does not include portions for SBATCH vars (e.g. account, time, etc.) 
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh
+
+#!/bin/bash
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <rocm_version> <num_nodes> <output_dir>"
+    exit 1
+fi
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$1
+# `16` or `64`
+NUM_NODES=$2
+# output directory
+OUTPUT_DIR=$3
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+    MPICH_VERSION=8.1.30
+else
+    MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log
+
+{
+    # reset modules
+    echo resetting modules:
+    module reset
+    # load modules
+    echo loading modules:
+    module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+    module load cray-mpich/${MPICH_VERSION}
+    module load rocm/${ROCM_VERSION}
+    module list
+
+    GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+    EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x
+    NUM_TASKS=$(($NUM_NODES * 8))
+
+    export MPICH_GPU_SUPPORT_ENABLED=1
+    export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+    echo start gemm: $(date)
+    CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+        --gpus-per-node 8 \
+        --gpus-per-task 1 \
+        --ntasks-per-node 8 \
+        --output $OUTPUT_FILE \
+        $EXEC"
+    echo running:
+    echo $CMD
+    $CMD
+    echo end gemm: $(date)
+} &>> $OUTPUT_FILE
diff --git a/nanoGPT/README.md b/nanoGPT/README.md
index 5c499fc..87e8189 100644
--- a/nanoGPT/README.md
+++ b/nanoGPT/README.md
@@ -1,33 +1,62 @@
-# nanoGPT Setup Instructions
+# nanoGPT README
+For more detailed installation parameters, please refer to the [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT).
-## Clone the Repository
+Repository: [nanoGPT](https://github.com/axonn-ai/nanoGPT)
-```sh
-git clone https://github.com/axonn-ai/nanoGPT.git
-```
-## Create Python Environment
+## Perlmutter Setup
-```sh
-./scripts/create_python_env_perlmutter.sh
-```
+### Setup steps
-> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
-## Load PyTorch Module
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_perlmutter.sh
+   ```
+   > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
-```sh
-module load pytorch/2.0.1
-```
+3. Load PyTorch Module
+   ```sh
+   module load pytorch/2.0.1
+   ```
-## Activate the Environment
+4. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
-```sh
-source path_to_nanogptENV/bin/activate
-```
+5. Download Data
+   ```sh
+   python nanoGPT/data/openwebtext/prepare.py
+   ```
-## Download Data
+## Frontier Setup
-```sh
-python nanoGPT/data/openwebtext/prepare.py
-```
\ No newline at end of file
+### Setup steps
+
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
+
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_frontier.sh
+   ```
+   > Note: You may need to modify the WKSPC path and torch version in `create_python_env_frontier.sh`.
+
+3. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
+
+4. 
Download Data + ```sh + python data/openwebtext/prepare.py + ``` \ No newline at end of file diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh new file mode 100644 index 0000000..901561e --- /dev/null +++ b/nanoGPT/run_frontier16.sh @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH -N 16 +#SBATCH -n 128 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH --gpu-bind none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier16.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/${ROCM_VERSION} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load cray-mpich/8.1.30 +module list +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +SCRIPT="train_frontier.py config/train_gpt_neox_5B.py" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" +# log start date +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE +# log end date +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier64.sh b/nanoGPT/run_frontier64.sh new file mode 100644 index 0000000..3201b51 --- /dev/null +++ b/nanoGPT/run_frontier64.sh @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH -N 64 +#SBATCH -n 512 +#SBATCH -q normal +#SBATCH -J nanogpt +#SBATCH --gpu-bind 
none +#SBATCH -t 00:30:00 +#SBATCH -A csc569 +#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-output.log +#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/%x-%j/job-error.log +#SBATCH --exclusive +# Run like: sbatch run_frontier64.sh + +echo "start run: $(date)" +export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID +OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log +ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log + +export SCRATCH="/lustre/orion/csc569/scratch/keshprad" +export WRKSPC="${SCRATCH}/nanoGPT" +export HF_HOME="${SCRATCH}/.cache/hf" +export HF_TRANSFORMERS_CACHE="${HF_HOME}" +export HF_DATASETS_CACHE="${HF_HOME}/datasets" +cd $WRKSPC + +# load modules +ROCM_VERSION=6.1.3 +echo resetting modules: +module reset +echo loading modules: +module load PrgEnv-gnu/8.5.0 +module load rocm/${ROCM_VERSION} +module load craype-accel-amd-gfx90a +module load cray-python/3.9.13.1 +module load cray-mpich/8.1.30 +module list +# activate env +source ${WRKSPC}/axonn_nanogpt/bin/activate + +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 8 )) +## master addr and port +# setting variables for torch.distributed +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export WORLD_SIZE=$GPUS +export OMP_NUM_THREADS=7 + +## some RCCL env variables +export FI_CXI_ATS=0 +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn0 +export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# AWS-OFI-RCCL +export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH" +# other +export MPICH_GPU_SUPPORT_ENABLED=1 +export GPU_MAX_HW_QUEUES=1 +export OFI_NCCL_USE_IPV6_TCP=1 + +SCRIPT="train_frontier.py config/train_gpt_neox_20B.py" + +# run with profiler +export WITH_PROFILER=1 +OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log" +# log start date +echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE +# log end date +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export 
MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? + +# adamw optimizer +learning_rate = 1e-4 # max learning rate +max_iters = 30 # total number of training iterations + +# axonn params +G_intra_d=16 +G_intra_c=1 +G_intra_r=1 +compile=False # disable compile for axonn +gradient_checkpointing=True + +# this makes total number of tokens be 300B +max_iters = 30 +lr_decay_iters = 600000 + +# eval stuff +eval_interval = 1000 +eval_iters = 1 +log_interval = 10 + +# weight decay +weight_decay = 1e-1 + +# log every iteration +log_interval=1 \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py new file mode 100644 index 0000000..4ce7b55 --- /dev/null +++ b/nanoGPT/train_gpt_neox_5B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 16 +block_size = 512 +gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus + +# model +n_layer = 24 +n_head = 32 +n_embd = 4096 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
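+
+# Rough tokens per optimizer step implied by the values above (using this config's own
+# "per_gpu x num_gpus" convention for gradient_accumulation_steps; an estimate, not a measured number):
+#   16 batch_size * 512 block_size * 256 gradient_accumulation_steps = 2,097,152 (~2.1M) tokens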
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+max_iters = 30 # total number of training iterations
+
+# axonn params
+G_intra_d=16
+G_intra_c=1
+G_intra_r=1
+compile=False # disable compile for axonn
+gradient_checkpointing=True
+
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 1
+
+# weight decay
+weight_decay = 1e-1
+
+# log every iteration
+log_interval=1
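+
+# Approximate parameter count implied by this config, using the standard 12 * n_layer * n_embd^2
+# transformer estimate plus token embeddings (assuming the ~50k-token GPT-2 vocabulary):
+#   12 * 24 * 4096^2 ≈ 4.8B, plus ~0.2B embedding parameters, i.e. roughly the 5B in the filename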